mirror of
https://github.com/syssec-utd/pylingual.git
synced 2026-05-10 18:39:03 -07:00
rename
@@ -0,0 +1,74 @@
import json
import logging
import pathlib
from dataclasses import dataclass
from typing import Optional


@dataclass
class TrainingParameters:
    batch_size: int
    epochs: int
    learning_rate: float


@dataclass
class SegmentationConfiguration:
    base_repo_name: str
    dataset_repo_name: str
    pretrained_mlm_repo_name: str
    cache_dir: pathlib.Path
    max_token_length: int
    dataset_percentage: int
    mlm_training_parameters: TrainingParameters
    segmentation_training_parameters: TrainingParameters

    @property
    def tokenizer_repo_name(self):
        return self.base_repo_name + "-tokenizer"

    @property
    def tokenizer_json_path(self):
        return self.cache_dir / "tokenizers" / self.tokenizer_repo_name / "tokenizer.json"

    @property
    def tokenized_dataset_repo_name(self):
        return self.dataset_repo_name + "-tokenized"

    @property
    def mlm_repo_name(self):
        return self.base_repo_name + "-mlm"

    @property
    def mlm_dir(self):
        return self.cache_dir / "models" / self.mlm_repo_name

    @property
    def segmenter_repo_name(self):
        return self.base_repo_name + "-segmenter"

    @property
    def segmenter_dir(self):
        return self.cache_dir / "models" / self.segmenter_repo_name

    @property
    def dataset_dir(self):
        return self.cache_dir / "datasets" / self.dataset_repo_name

    def __post_init__(self):
        self.cache_dir = pathlib.Path(self.cache_dir)


def parse_segmentation_config_json(json_file_path: pathlib.Path, logger: Optional[logging.Logger] = None) -> SegmentationConfiguration:
    if not json_file_path.exists():
        raise FileNotFoundError(f"{json_file_path} does not exist")

    if logger:
        logger.info(f"Loading model description from {json_file_path}...")

    with json_file_path.open() as json_file:
        segmentation_config_dict = json.load(json_file)

    segmentation_config_dict["mlm_training_parameters"] = TrainingParameters(**segmentation_config_dict["mlm_training_parameters"])
    segmentation_config_dict["segmentation_training_parameters"] = TrainingParameters(**segmentation_config_dict["segmentation_training_parameters"])
    return SegmentationConfiguration(**segmentation_config_dict)
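For reference, a minimal sketch (not part of this commit) of the JSON file that parse_segmentation_config_json expects. Every key mirrors a SegmentationConfiguration field; the repository names and hyperparameter values below are illustrative assumptions, not values shipped with the project.

example_config = {
    "base_repo_name": "your-org/bytecode-segmentation",  # hypothetical repo name
    "dataset_repo_name": "your-org/bytecode-dataset",  # hypothetical repo name
    "pretrained_mlm_repo_name": "",  # empty string -> an untrained RoBERTa MLM is initialized instead of fine-tuning
    "cache_dir": "cache",
    "max_token_length": 512,
    "dataset_percentage": 100,
    "mlm_training_parameters": {"batch_size": 16, "epochs": 3, "learning_rate": 5e-5},
    "segmentation_training_parameters": {"batch_size": 16, "epochs": 3, "learning_rate": 5e-5},
}
# json.dump(example_config, open("segmentation.json", "w")) produces a file the scripts below can consume.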
@@ -0,0 +1,152 @@
import ast
import functools
import os
import pathlib
import click

from datasets import load_dataset
from huggingface_hub import hf_hub_download
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from pylingual.segmentation.sliding_window import sliding_window
from transformers import PreTrainedTokenizerFast

bytecode_separator = " <SEP> "


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name

    tokenizer_file = hf_hub_download(repo_id=tokenizer_repo_name, filename="tokenizer.json", token=True, cache_dir=str(tokenizer_dir))
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    return tokenizer


# we need to make sure we align all the labels with the proper words.
def align_labels_with_tokens(labels, word_ids):
    label_names = ["B", "I", "E"]
    id2label = {str(i): label for i, label in enumerate(label_names)}
    label2id = {v: k for k, v in id2label.items()}

    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else int(label2id[labels[word_id]])
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = int(label2id[labels[word_id]])
            new_labels.append(label)
    return new_labels


# the processing function used to tokenize the dataset
def tokenize_and_align_labels(tokenizer: PreTrainedTokenizerFast, max_length: int, examples):
    MAX_WINDOW_LENGTH = 512
    STEP_SIZE = 128

    # parse the strings into lists to better work with the bytecode and boundaries
    parsed_bc = [(codeobj.split(" <SEP> "), ast.literal_eval(bounds)) for codeobj, bounds in zip(examples["bytecode"], examples["boundary"])]

    codeobj_tokens = []

    # count the tokens for each bytecode instruction in a codeobj
    for codeobj, bounds in parsed_bc:
        token_list = []

        for bc, bounds in zip(codeobj, bounds):
            token_list.append(((bc, bounds), len(tokenizer(bc)[0])))

        codeobj_tokens.append(token_list)

    windows = [sliding_window(codeobj, MAX_WINDOW_LENGTH, STEP_SIZE) for codeobj in codeobj_tokens]

    # remake examples using our windows
    examples["boundary"] = []
    examples["bytecode"] = []

    # go through each window
    for window in windows:
        for item in window:
            # where we will temporarily store our bytecode and bounds
            bytecode = []
            bounds = []

            for bc in item[0]:
                bytecode.append(bc[0])
                bounds.append(bc[1])

            # append it to examples
            examples["bytecode"].append(bytecode_separator.join(bytecode))
            examples["boundary"].append(str(bounds))

    tokenized_inputs = tokenizer(
        examples["bytecode"],
        truncation=True,
        max_length=max_length,
    )

    all_labels = examples["boundary"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        labels = labels.replace("'", "").strip("][").split(", ")
        word_ids = tokenized_inputs.word_ids(i)
        labels_len = len(labels)
        max_word_id = word_ids[-2]
        # some examples map to more words than they have labels because of incorrect tokenization;
        # mask all of their labels instead of failing and keep them as noisy data.
        if max_word_id >= labels_len:
            new_labels.append([-100] * max_word_id)
        else:
            new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels

    return tokenized_inputs


def tokenize_segmentation_dataset(config: SegmentationConfiguration):
    raw_dataset = load_dataset(config.dataset_repo_name, token=True, cache_dir=str(config.dataset_dir))

    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    prepped_tokenize_and_align_labels = functools.partial(tokenize_and_align_labels, tokenizer, config.max_token_length)

    # tokenize input dataset
    column_names = raw_dataset["train"].column_names
    tokenized_datasets = raw_dataset.map(
        prepped_tokenize_and_align_labels,
        batched=True,
        remove_columns=column_names,
        num_proc=os.cpu_count(),
        desc="Tokenizing datasets",
    )

    tokenized_datasets.push_to_hub(
        config.tokenized_dataset_repo_name,
        private=True,
    )


@click.command(help="Script to tokenize the segmentation dataset given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    tokenize_segmentation_dataset(segmentation_config)


if __name__ == "__main__":
    main()
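A small worked example (inputs assumed for illustration, not taken from the dataset) of how align_labels_with_tokens above expands per-instruction labels into per-token labels: each sub-token inherits the label id of its instruction, and special tokens such as [CLS]/[SEP] are masked with -100.

labels = ["B", "I", "E"]  # one label per bytecode instruction in the window
word_ids = [None, 0, 0, 1, 2, None]  # as returned by tokenized_inputs.word_ids(i)
align_labels_with_tokens(labels, word_ids)
# -> [-100, 0, 0, 1, 2, -100]  ("B" -> 0, "I" -> 1, "E" -> 2)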
@@ -0,0 +1,195 @@
import logging
import os
import pathlib
import click

from datasets import load_dataset
from huggingface_hub import hf_hub_download, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling, PreTrainedTokenizerFast, RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments

from pylingual.segmentation.sliding_window import sliding_window

bytecode_separator = " <SEP> "


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name

    tokenizer_file = hf_hub_download(
        repo_id=tokenizer_repo_name,
        filename="tokenizer.json",
        token=True,
        cache_dir=str(tokenizer_dir),
    )
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    return tokenizer


def load_tokenized_train_dataset(
    dataset_repo_name: str,
    tokenizer: PreTrainedTokenizerFast,
    max_length: int,
    cache_dir: pathlib.Path,
):
    dataset_dir = cache_dir / "datasets" / dataset_repo_name
    raw_dataset = load_dataset(dataset_repo_name, token=True, cache_dir=str(dataset_dir), split="train")

    # tokenize the input data
    column_names = raw_dataset.column_names

    def tokenize(examples):
        # sliding window compatibility
        MAX_WINDOW_LENGTH = 512
        STEP_SIZE = 128

        # parse the strings into lists to better work with the bytecode and boundaries
        parsed_bc = [codeobj.split(" <SEP> ") for codeobj in examples["bytecode"]]

        codeobj_tokens = []

        # count the tokens for each bytecode instruction in a codeobj
        for codeobj in parsed_bc:
            token_list = []

            for bytecode in codeobj:
                token_list.append((bytecode, len(tokenizer(bytecode)[0])))

            codeobj_tokens.append(token_list)

        windows = [sliding_window(codeobj, MAX_WINDOW_LENGTH, STEP_SIZE) for codeobj in codeobj_tokens]

        # remake examples using our windows
        examples["bytecode"] = []

        # go through each window
        for window in windows:
            for item in window:
                # where we will temporarily store our bytecode
                bytecode = []

                for bc in item[0]:
                    bytecode.append(bc)

                # append to examples
                examples["bytecode"].append(bytecode_separator.join(bytecode))

        return tokenizer(examples["bytecode"], max_length=max_length, truncation=True)

    tokenized_dataset = raw_dataset.map(
        tokenize,
        batched=True,
        remove_columns=column_names,
        num_proc=os.cpu_count(),
        desc="Tokenizing datasets",
    )

    return tokenized_dataset


def load_pretrained_mlm(
    pretrained_mlm_repo_name: str,
    tokenizer_embedding_length: int,
    cache_dir: pathlib.Path,
) -> AutoModelForMaskedLM:
    # load a basic pretrained BERT model
    pretrained_mlm_dir = cache_dir / "models" / pretrained_mlm_repo_name
    model = AutoModelForMaskedLM.from_pretrained(pretrained_mlm_repo_name, cache_dir=str(pretrained_mlm_dir))

    # resize token embeddings to fit the tokenizer
    model.resize_token_embeddings(tokenizer_embedding_length)

    return model


def initialize_untrained_mlm(
    tokenizer_embedding_length: int,
    max_token_length: int,
) -> RobertaForMaskedLM:
    # initialize untrained RoBERTa model
    # most configuration options set to match https://huggingface.co/microsoft/codebert-base/blob/main/config.json for direct comparison
    model_config = RobertaConfig(
        max_position_embeddings=max_token_length,  # INPUT LENGTH LIMIT
        vocab_size=tokenizer_embedding_length,
        layer_norm_eps=1e-05,
        type_vocab_size=1,
    )
    model = RobertaForMaskedLM(model_config)

    return model


def train_mlm(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)

    using_pretrained_model = bool(config.pretrained_mlm_repo_name)
    # train model; for now the configuration comes from a regular T5 translation model.
    training_args = TrainingArguments(
        output_dir=str(config.mlm_dir),
        num_train_epochs=config.mlm_training_parameters.epochs,
        per_device_train_batch_size=config.mlm_training_parameters.batch_size,
        save_steps=1000,
        save_total_limit=5,
        prediction_loss_only=True,
        push_to_hub=True,
        hub_model_id=config.mlm_repo_name,
        hub_private_repo=True,
        ddp_backend="nccl",
        ddp_find_unused_parameters=using_pretrained_model,  # only look for unused parameters in pretrained models
        remove_unused_columns=False,
    )

    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)

    # Set the DataCollator for the MLM task and set the masking probability.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    if using_pretrained_model:
        pretrained_mlm = load_pretrained_mlm(config.pretrained_mlm_repo_name, len(tokenizer), config.cache_dir)
    else:
        # RoBERTa offsets position ids by the padding index, so reserve two extra position embeddings
        pretrained_mlm = initialize_untrained_mlm(len(tokenizer), config.max_token_length + 2)

    tokenized_training_data = load_tokenized_train_dataset(config.dataset_repo_name, tokenizer, config.max_token_length, config.cache_dir)

    # Hugging Face Trainer: a Trainer class to fine-tune pretrained models
    trainer = Trainer(
        model=pretrained_mlm,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_training_data,
    )

    # Training
    trainer.train()

    if int(os.environ["LOCAL_RANK"]) == 0:
        # Save the model
        trainer.save_model(config.mlm_dir)

        trainer.push_to_hub(
            finetuned_from=config.pretrained_mlm_repo_name,
            dataset=config.dataset_repo_name,
            commit_message=f"Trained on {config.dataset_repo_name} using {config.tokenizer_repo_name}",
        )


@click.command(help="Training script for the masked language model pretraining for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_mlm(segmentation_config)


if __name__ == "__main__":
    main()
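A side note on the collator configured above: DataCollatorForLanguageModeling with mlm_probability=0.15 applies dynamic masking per batch following the standard BERT recipe (of the selected tokens, roughly 80% become [MASK], 10% a random token, and 10% are left unchanged). A minimal sketch, assuming data_collator and tokenized_training_data as built in train_mlm:

batch = data_collator([tokenized_training_data[i] for i in range(2)])
# batch["input_ids"]: inputs with ~15% of tokens masked or randomized
# batch["labels"]: the original token ids at masked positions, -100 everywhere else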
@@ -0,0 +1,155 @@
import logging
import os
import pathlib
import click

import evaluate
import numpy as np
from datasets import ReadInstruction, load_dataset
from huggingface_hub import hf_hub_download, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, PreTrainedTokenizerFast, Trainer, TrainingArguments

# two dictionaries, id2label and label2id, which contain the mappings from ID to label and vice versa.
label_names = ["B", "I", "E"]
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}


# compute_metrics: evaluation metric for training and evaluation.
def compute_metrics(eval_preds):
    metric = evaluate.load("seqeval")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]  # noqa: E741
    true_predictions = [[label_names[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]  # noqa: E741
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name

    tokenizer_file = hf_hub_download(
        repo_id=tokenizer_repo_name,
        filename="tokenizer.json",
        token=True,
        cache_dir=str(tokenizer_dir),
    )
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    return tokenizer


def load_tokenized_train_and_valid_dataset(dataset_repo_name: str, cache_dir: pathlib.Path, dataset_percentage: int = 100):
    dataset_dir = cache_dir / "datasets" / dataset_repo_name
    # Load the tokenized dataset
    tokenized_train_dataset = load_dataset(
        dataset_repo_name,
        token=True,
        cache_dir=str(dataset_dir),
        split=ReadInstruction("train", to=dataset_percentage, unit="%"),
    )

    tokenized_validation_dataset = load_dataset(
        dataset_repo_name,
        token=True,
        cache_dir=str(dataset_dir),
        split="valid",
    )

    return tokenized_train_dataset, tokenized_validation_dataset


def train_segmentation_model(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    # training arguments.
    training_args = TrainingArguments(
        output_dir=str(config.segmenter_dir),
        overwrite_output_dir=True,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=config.segmentation_training_parameters.learning_rate,
        num_train_epochs=config.segmentation_training_parameters.epochs,
        per_device_train_batch_size=config.segmentation_training_parameters.batch_size,
        save_steps=1000,
        weight_decay=0.01,
        fp16=True,
        push_to_hub=True,
        hub_model_id=config.segmenter_repo_name,
        hub_private_repo=True,
        ddp_backend="nccl",
        ddp_find_unused_parameters=True,
        save_total_limit=5,
    )

    # load the pretrained MLM as a token classification model
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=config.mlm_repo_name,
        id2label=id2label,
        label2id=label2id,
        token=True,
    )

    # Set up the DataCollatorForTokenClassification
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, max_length=config.max_token_length)

    (
        tokenized_train_dataset,
        tokenized_validation_dataset,
    ) = load_tokenized_train_and_valid_dataset(config.tokenized_dataset_repo_name, config.cache_dir, config.dataset_percentage)

    # Hugging Face Trainer: a Trainer class to fine-tune pretrained models
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # Training
    trainer.train()

    if int(os.environ["LOCAL_RANK"]) == 0:
        # Save the model
        trainer.save_model(str(config.segmenter_dir))

        trainer.push_to_hub(
            finetuned_from=config.mlm_repo_name,
            dataset=config.tokenized_dataset_repo_name,
            commit_message=f"Trained on {config.tokenized_dataset_repo_name} using {config.mlm_repo_name}",
        )


@click.command(help="Training script for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_segmentation_model(segmentation_config)


if __name__ == "__main__":
    main()
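A minimal inference sketch of reading the B/I/E predictions back from the trained token classifier; the bytecode string is illustrative, and model and tokenizer are assumed to be the objects built in train_segmentation_model (or reloaded from config.segmenter_dir afterwards), so this is a sketch rather than part of the commit:

enc = tokenizer("LOAD_CONST 0 <SEP> RETURN_VALUE", return_tensors="pt")
logits = model(**enc).logits  # shape (1, sequence_length, 3)
pred_ids = logits.argmax(-1)[0].tolist()
pred_labels = [id2label[str(i)] for i in pred_ids]  # one "B"/"I"/"E" per token; map back to instructions via word_ids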
@@ -0,0 +1,96 @@
import logging
import pathlib
import click

from datasets import ReadInstruction, load_dataset
from huggingface_hub import HfApi, create_repo, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]


def get_untrained_tokenizer() -> Tokenizer:
    # WordPiece tokenization, as used for BERT.
    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

    # The normalizer decomposes accented characters and strips the accents out.
    tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])

    # The pre-tokenizer splits on <SEP> tokens.
    tokenizer.pre_tokenizer = pre_tokenizers.Split("<SEP>", "removed")

    return tokenizer


def post_training_configuration(tokenizer: Tokenizer):
    cls_token_id = tokenizer.token_to_id("[CLS]")
    sep_token_id = tokenizer.token_to_id("[SEP]")

    # Set decoder for the tokenizer
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    # For TemplateProcessing, we have to specify how to treat a single sentence and a pair of sentences.
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
    )


def save_and_upload_tokenizer(
    tokenizer: Tokenizer,
    tokenizer_json_path: pathlib.Path,
    tokenizer_repo_name: str,
    dataset_name: str,
):
    # save the tokenizer locally
    tokenizer_json_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_json_path.resolve()))

    # upload the tokenizer to the Hugging Face Hub
    api = HfApi()
    create_repo(tokenizer_repo_name, exist_ok=True, private=True)
    api.upload_file(
        path_in_repo="tokenizer.json",
        path_or_fileobj=str(tokenizer_json_path.resolve()),
        repo_id=tokenizer_repo_name,
        commit_message=f"Trained tokenizer using {dataset_name}",
    )


def train_tokenizer(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)

    tokenizer = get_untrained_tokenizer()

    train_dataset = load_dataset(
        config.dataset_repo_name,
        token=True,
        split=ReadInstruction("train", to=config.dataset_percentage, unit="%"),
    )["bytecode"]
    trainer = trainers.WordPieceTrainer(vocab_size=30000, special_tokens=special_tokens)
    tokenizer.train_from_iterator(train_dataset, trainer=trainer)

    post_training_configuration(tokenizer)

    save_and_upload_tokenizer(
        tokenizer,
        config.tokenizer_json_path,
        config.tokenizer_repo_name,
        config.dataset_repo_name,
    )


@click.command(help="Training script for the bytecode tokenizer for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_tokenizer(segmentation_config)


if __name__ == "__main__":
    main()
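A quick sketch (with an illustrative bytecode string, not data from the repository) of what the Split pre-tokenizer configured in get_untrained_tokenizer does: it removes the <SEP> markers so WordPiece learns its vocabulary over whole bytecode instructions rather than over the separator itself.

from tokenizers import pre_tokenizers
pre = pre_tokenizers.Split("<SEP>", "removed")
pre.pre_tokenize_str("LOAD_CONST 0 <SEP> RETURN_VALUE")
# -> [('LOAD_CONST 0 ', (0, 13)), (' RETURN_VALUE', (18, 31))]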