This commit is contained in:
caandt
2025-03-13 16:56:36 -05:00
parent b2439eee3e
commit 046e80cdd1
27 changed files
@@ -0,0 +1,74 @@
import json
import logging
import pathlib
from dataclasses import dataclass
from typing import Optional


@dataclass
class TrainingParameters:
    batch_size: int
    epochs: int
    learning_rate: float


@dataclass
class SegmentationConfiguration:
    base_repo_name: str
    dataset_repo_name: str
    pretrained_mlm_repo_name: str
    cache_dir: pathlib.Path
    max_token_length: int
    dataset_percentage: int
    mlm_training_parameters: TrainingParameters
    segmentation_training_parameters: TrainingParameters

    def __post_init__(self):
        self.cache_dir = pathlib.Path(self.cache_dir)

    @property
    def tokenizer_repo_name(self):
        return self.base_repo_name + "-tokenizer"

    @property
    def tokenizer_json_path(self):
        return self.cache_dir / "tokenizers" / self.tokenizer_repo_name / "tokenizer.json"

    @property
    def tokenized_dataset_repo_name(self):
        return self.dataset_repo_name + "-tokenized"

    @property
    def mlm_repo_name(self):
        return self.base_repo_name + "-mlm"

    @property
    def mlm_dir(self):
        return self.cache_dir / "models" / self.mlm_repo_name

    @property
    def segmenter_repo_name(self):
        return self.base_repo_name + "-segmenter"

    @property
    def segmenter_dir(self):
        return self.cache_dir / "models" / self.segmenter_repo_name

    @property
    def dataset_dir(self):
        return self.cache_dir / "datasets" / self.dataset_repo_name


def parse_segmentation_config_json(json_file_path: pathlib.Path, logger: Optional[logging.Logger] = None) -> SegmentationConfiguration:
    if not json_file_path.exists():
        raise FileNotFoundError(f"{json_file_path} does not exist")
    if logger:
        logger.info(f"Loading model description from {json_file_path}...")
    with json_file_path.open() as json_file:
        segmentation_config_dict = json.load(json_file)
    segmentation_config_dict["mlm_training_parameters"] = TrainingParameters(**segmentation_config_dict["mlm_training_parameters"])
    segmentation_config_dict["segmentation_training_parameters"] = TrainingParameters(**segmentation_config_dict["segmentation_training_parameters"])
    return SegmentationConfiguration(**segmentation_config_dict)
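# Illustrative round-trip (not part of the original commit; the repo names and
# paths below are hypothetical placeholders). The config JSON mirrors the
# dataclass fields above, with the two TrainingParameters objects as plain dicts.
if __name__ == "__main__":
    import tempfile

    example = {
        "base_repo_name": "example-org/bytecode-segmenter",
        "dataset_repo_name": "example-org/bytecode-dataset",
        "pretrained_mlm_repo_name": "",  # empty string -> train the MLM from scratch
        "cache_dir": "/tmp/segmentation-cache",
        "max_token_length": 512,
        "dataset_percentage": 100,
        "mlm_training_parameters": {"batch_size": 16, "epochs": 3, "learning_rate": 5e-5},
        "segmentation_training_parameters": {"batch_size": 16, "epochs": 5, "learning_rate": 2e-5},
    }
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(example, f)
    config = parse_segmentation_config_json(pathlib.Path(f.name))
    print(config.tokenizer_repo_name)  # example-org/bytecode-segmenter-tokenizer
    print(config.mlm_dir)  # /tmp/segmentation-cache/models/example-org/bytecode-segmenter-mlm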
@@ -0,0 +1,152 @@
import ast
import functools
import os
import pathlib

import click
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from pylingual.segmentation.sliding_window import sliding_window
from transformers import PreTrainedTokenizerFast

bytecode_separator = " <SEP> "


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name
    tokenizer_file = hf_hub_download(repo_id=tokenizer_repo_name, filename="tokenizer.json", token=True, cache_dir=str(tokenizer_dir))
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return tokenizer


# we need to make sure we align all the labels with the proper words:
# special tokens get -100 (ignored by the loss) and every subtoken of a word
# inherits that word's label
def align_labels_with_tokens(labels, word_ids):
    label_names = ["B", "I", "E"]
    id2label = {str(i): label for i, label in enumerate(label_names)}
    label2id = {v: k for k, v in id2label.items()}
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # start of a new word
            current_word = word_id
            label = -100 if word_id is None else int(label2id[labels[word_id]])
            new_labels.append(label)
        elif word_id is None:
            # special token
            new_labels.append(-100)
        else:
            # same word as the previous token
            label = int(label2id[labels[word_id]])
            new_labels.append(label)
    return new_labels
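# Worked example (illustrative, not from the original commit): with per-word
# labels ["B", "I", "E"] and fast-tokenizer word_ids [None, 0, 0, 1, 2, None]
# ([CLS], two subtokens of word 0, words 1 and 2, [SEP]), the special positions
# are masked with -100 and both subtokens of word 0 share label id 0:
assert align_labels_with_tokens(["B", "I", "E"], [None, 0, 0, 1, 2, None]) == [-100, 0, 0, 1, 2, -100]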
# the map function used to tokenize the dataset
def tokenize_and_align_labels(tokenizer: PreTrainedTokenizerFast, max_length: int, examples):
    MAX_WINDOW_LENGTH = 512
    STEP_SIZE = 128
    # parse the strings into lists to better work with the bytecode and boundaries
    parsed_bc = [(codeobj.split(bytecode_separator), ast.literal_eval(bounds)) for codeobj, bounds in zip(examples["bytecode"], examples["boundary"])]
    codeobj_tokens = []
    # count the tokens for each bytecode instruction in a codeobj
    for codeobj, bounds in parsed_bc:
        token_list = []
        for bc, bound in zip(codeobj, bounds):
            token_list.append(((bc, bound), len(tokenizer(bc)[0])))
        codeobj_tokens.append(token_list)
    windows = [sliding_window(codeobj, MAX_WINDOW_LENGTH, STEP_SIZE) for codeobj in codeobj_tokens]
    # remake examples using our windows
    examples["boundary"] = []
    examples["bytecode"] = []
    # go through each window
    for window in windows:
        for item in window:
            # temporary storage for this window's bytecode and bounds
            bytecode = []
            bounds = []
            for bc in item[0]:
                bytecode.append(bc[0])
                bounds.append(bc[1])
            # append the window back into examples
            examples["bytecode"].append(bytecode_separator.join(bytecode))
            examples["boundary"].append(str(bounds))
    tokenized_inputs = tokenizer(
        examples["bytecode"],
        truncation=True,
        max_length=max_length,
    )
    all_labels = examples["boundary"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        labels = labels.replace("'", "").strip("][").split(", ")
        word_ids = tokenized_inputs.word_ids(i)
        labels_len = len(labels)
        max_word_id = word_ids[-2]  # the last entry is the [SEP] special token (word id None)
        # Some examples tokenize into more words than they have labels; rather
        # than drop them, mask their labels entirely and keep them as noisy data.
        if max_word_id >= labels_len:
            new_labels.append([-100] * max_word_id)
        else:
            new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
def tokenize_segmentation_dataset(config: SegmentationConfiguration):
    raw_dataset = load_dataset(config.dataset_repo_name, token=True, cache_dir=str(config.dataset_dir))
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    prepped_tokenize_and_align_labels = functools.partial(tokenize_and_align_labels, tokenizer, config.max_token_length)
    # tokenize the input dataset
    column_names = raw_dataset["train"].column_names
    tokenized_datasets = raw_dataset.map(
        prepped_tokenize_and_align_labels,
        batched=True,
        remove_columns=column_names,
        num_proc=os.cpu_count(),
        desc="Tokenizing datasets",
    )
    tokenized_datasets.push_to_hub(
        config.tokenized_dataset_repo_name,
        private=True,
    )


@click.command(help="Script to tokenize the segmentation dataset given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    tokenize_segmentation_dataset(segmentation_config)


if __name__ == "__main__":
    main()
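# Usage sketch (the script's file name is not shown in this commit view, so the
# name below is hypothetical):
#   python tokenize_segmentation_dataset.py path/to/segmentation_config.json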
@@ -0,0 +1,195 @@
import logging
import os
import pathlib

import click
from datasets import load_dataset
from huggingface_hub import hf_hub_download, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling, PreTrainedTokenizerFast, RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments
from pylingual.segmentation.sliding_window import sliding_window

bytecode_separator = " <SEP> "


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name
    tokenizer_file = hf_hub_download(
        repo_id=tokenizer_repo_name,
        filename="tokenizer.json",
        token=True,
        cache_dir=str(tokenizer_dir),
    )
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return tokenizer
def load_tokenized_train_dataset(
    dataset_repo_name: str,
    tokenizer: PreTrainedTokenizerFast,
    max_length: int,
    cache_dir: pathlib.Path,
):
    dataset_dir = cache_dir / "datasets" / dataset_repo_name
    raw_dataset = load_dataset(dataset_repo_name, token=True, cache_dir=str(dataset_dir), split="train")
    # tokenize the input data
    column_names = raw_dataset.column_names

    def tokenize(examples):
        # sliding-window parameters
        MAX_WINDOW_LENGTH = 512
        STEP_SIZE = 128
        # parse the strings into lists to better work with the bytecode
        parsed_bc = [codeobj.split(bytecode_separator) for codeobj in examples["bytecode"]]
        codeobj_tokens = []
        # count the tokens for each bytecode instruction in a codeobj
        for codeobj in parsed_bc:
            token_list = []
            for bytecode in codeobj:
                token_list.append((bytecode, len(tokenizer(bytecode)[0])))
            codeobj_tokens.append(token_list)
        windows = [sliding_window(codeobj, MAX_WINDOW_LENGTH, STEP_SIZE) for codeobj in codeobj_tokens]
        # remake examples using our windows
        examples["bytecode"] = []
        # go through each window
        for window in windows:
            for item in window:
                # temporary storage for this window's bytecode
                bytecode = []
                for bc in item[0]:
                    bytecode.append(bc)
                # append the window back into examples
                examples["bytecode"].append(bytecode_separator.join(bytecode))
        return tokenizer(examples["bytecode"], max_length=max_length, truncation=True)

    tokenized_dataset = raw_dataset.map(
        tokenize,
        batched=True,
        remove_columns=column_names,
        num_proc=os.cpu_count(),
        desc="Tokenizing datasets",
    )
    return tokenized_dataset
def load_pretrained_mlm(
    pretrained_mlm_repo_name: str,
    tokenizer_embedding_length: int,
    cache_dir: pathlib.Path,
) -> AutoModelForMaskedLM:
    # load the pretrained masked language model
    pretrained_mlm_dir = cache_dir / "models" / pretrained_mlm_repo_name
    model = AutoModelForMaskedLM.from_pretrained(pretrained_mlm_repo_name, cache_dir=str(pretrained_mlm_dir))
    # resize the token embeddings to fit the tokenizer's vocabulary
    model.resize_token_embeddings(tokenizer_embedding_length)
    return model


def initialize_untrained_mlm(
    tokenizer_embedding_length: int,
    max_token_length: int,
) -> RobertaForMaskedLM:
    # initialize an untrained RoBERTa model; most configuration options are set to match
    # https://huggingface.co/microsoft/codebert-base/blob/main/config.json for direct comparison
    model_config = RobertaConfig(
        max_position_embeddings=max_token_length,  # input length limit
        vocab_size=tokenizer_embedding_length,
        layer_norm_eps=1e-05,
        type_vocab_size=1,
    )
    model = RobertaForMaskedLM(model_config)
    return model
def train_mlm(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    using_pretrained_model = bool(config.pretrained_mlm_repo_name)
    # training arguments; for now the configuration comes from a regular T5 translation model
    training_args = TrainingArguments(
        output_dir=str(config.mlm_dir),
        num_train_epochs=config.mlm_training_parameters.epochs,
        per_device_train_batch_size=config.mlm_training_parameters.batch_size,
        save_steps=1000,
        save_total_limit=5,
        prediction_loss_only=True,
        push_to_hub=True,
        hub_model_id=config.mlm_repo_name,
        hub_private_repo=True,
        ddp_backend="nccl",
        ddp_find_unused_parameters=using_pretrained_model,  # only look for unused parameters in pretrained models
        remove_unused_columns=False,
    )
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    # set up the data collator for the MLM task with the standard 15% masking probability
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    if using_pretrained_model:
        pretrained_mlm = load_pretrained_mlm(config.pretrained_mlm_repo_name, len(tokenizer), config.cache_dir)
    else:
        # RoBERTa reserves two extra position ids for the padding offset, hence max_token_length + 2
        pretrained_mlm = initialize_untrained_mlm(len(tokenizer), config.max_token_length + 2)
    tokenized_training_data = load_tokenized_train_dataset(config.dataset_repo_name, tokenizer, config.max_token_length, config.cache_dir)
    # Hugging Face Trainer class to fine-tune pretrained models
    trainer = Trainer(
        model=pretrained_mlm,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_training_data,
    )
    # training
    trainer.train()
    # save and push only from the main process (LOCAL_RANK defaults to 0 for non-distributed runs)
    if int(os.environ.get("LOCAL_RANK", "0")) == 0:
        # save the model
        trainer.save_model(str(config.mlm_dir))
        trainer.push_to_hub(
            finetuned_from=config.pretrained_mlm_repo_name,
            dataset=config.dataset_repo_name,
            commit_message=f"Trained on {config.dataset_repo_name} using {config.tokenizer_repo_name}",
        )


@click.command(help="Training script for the masked language model pretraining for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_mlm(segmentation_config)


if __name__ == "__main__":
    main()
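# Launch sketch (hypothetical file name): the NCCL DDP backend and the
# LOCAL_RANK check above imply a torchrun-style multi-GPU launch, e.g.
#   torchrun --nproc_per_node=4 train_mlm.py path/to/segmentation_config.json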
@@ -0,0 +1,155 @@
import logging
import os
import pathlib

import click
import evaluate
import numpy as np
from datasets import ReadInstruction, load_dataset
from huggingface_hub import hf_hub_download, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, PreTrainedTokenizerFast, Trainer, TrainingArguments

# two dictionaries, id2label and label2id, which contain the mappings from ID to label and vice versa
label_names = ["B", "I", "E"]
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}


# compute_metrics: the evaluation metric for training and evaluation
def compute_metrics(eval_preds):
    metric = evaluate.load("seqeval")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # remove the ignored index (special tokens) and convert ids back to label names
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]  # noqa: E741
    true_predictions = [[label_names[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]  # noqa: E741
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
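# Illustrative check (not from the original commit): one sequence whose
# [CLS]/[SEP] positions are masked with -100; only the three real tokens
# survive the filtering above and get scored by seqeval.
_example_labels = [[-100, 0, 1, 2, -100]]
assert [[label_names[l] for l in label if l != -100] for label in _example_labels] == [["B", "I", "E"]]  # noqa: E741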
def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name
    tokenizer_file = hf_hub_download(
        repo_id=tokenizer_repo_name,
        filename="tokenizer.json",
        token=True,
        cache_dir=str(tokenizer_dir),
    )
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return tokenizer


def load_tokenized_train_and_valid_dataset(dataset_repo_name: str, cache_dir: pathlib.Path, dataset_percentage: int = 100):
    dataset_dir = cache_dir / "datasets" / dataset_repo_name
    # load the tokenized dataset
    tokenized_train_dataset = load_dataset(
        dataset_repo_name,
        token=True,
        cache_dir=str(dataset_dir),
        split=ReadInstruction("train", to=dataset_percentage, unit="%"),
    )
    tokenized_validation_dataset = load_dataset(
        dataset_repo_name,
        token=True,
        cache_dir=str(dataset_dir),
        split="valid",
    )
    return tokenized_train_dataset, tokenized_validation_dataset
def train_segmentation_model(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    # training arguments
    training_args = TrainingArguments(
        output_dir=str(config.segmenter_dir),
        overwrite_output_dir=True,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=config.segmentation_training_parameters.learning_rate,
        num_train_epochs=config.segmentation_training_parameters.epochs,
        per_device_train_batch_size=config.segmentation_training_parameters.batch_size,
        save_steps=1000,
        weight_decay=0.01,
        fp16=True,
        push_to_hub=True,
        hub_model_id=config.segmenter_repo_name,
        hub_private_repo=True,
        ddp_backend="nccl",
        ddp_find_unused_parameters=True,
        save_total_limit=5,
    )
    # load the pretrained MLM with a fresh token-classification head
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=config.mlm_repo_name,
        id2label=id2label,
        label2id=label2id,
        token=True,
    )
    # set up the DataCollatorForTokenClassification
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, max_length=config.max_token_length)
    (
        tokenized_train_dataset,
        tokenized_validation_dataset,
    ) = load_tokenized_train_and_valid_dataset(config.tokenized_dataset_repo_name, config.cache_dir, config.dataset_percentage)
    # Hugging Face Trainer class to fine-tune pretrained models
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    # training
    trainer.train()
    # save and push only from the main process (LOCAL_RANK defaults to 0 for non-distributed runs)
    if int(os.environ.get("LOCAL_RANK", "0")) == 0:
        # save the model
        trainer.save_model(str(config.segmenter_dir))
        trainer.push_to_hub(
            finetuned_from=config.mlm_repo_name,
            dataset=config.tokenized_dataset_repo_name,
            commit_message=f"Trained on {config.tokenized_dataset_repo_name} using {config.mlm_repo_name}",
        )


@click.command(help="Training script for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_segmentation_model(segmentation_config)


if __name__ == "__main__":
    main()
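# Launch sketch (hypothetical file name), mirroring the MLM stage:
#   torchrun --nproc_per_node=4 train_segmentation_model.py path/to/segmentation_config.json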
@@ -0,0 +1,96 @@
import logging
import pathlib

import click
from datasets import ReadInstruction, load_dataset
from huggingface_hub import HfApi, create_repo, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]


def get_untrained_tokenizer() -> Tokenizer:
    # WordPiece tokenization, as used for BERT
    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    # the normalizer decomposes accented characters (NFD) and strips the accents out
    tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
    # the pre-tokenizer splits on <SEP> tokens
    tokenizer.pre_tokenizer = pre_tokenizers.Split("<SEP>", "removed")
    return tokenizer
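# Illustrative (not from the original commit): with the "removed" behavior, the
# pre-tokenizer drops the separator itself, so a string such as
# "LOAD_CONST 0 <SEP> RETURN_VALUE" is pre-tokenized into the two pieces
# "LOAD_CONST 0 " and " RETURN_VALUE" before WordPiece runs on each piece.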
def post_training_configuration(tokenizer: Tokenizer):
    cls_token_id = tokenizer.token_to_id("[CLS]")
    sep_token_id = tokenizer.token_to_id("[SEP]")
    # set the decoder for the tokenizer
    tokenizer.decoder = decoders.WordPiece(prefix="##")
    # for TemplateProcessing, we have to specify how to treat a single sequence and a pair of sequences
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
    )
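# Illustrative (not from the original commit): once this post-processor is set,
# encoding a single sequence yields [CLS] ... [SEP] with all type ids 0, and a
# pair yields [CLS] A [SEP] B [SEP] with B's tokens and its closing [SEP]
# carrying type id 1, exactly as the template strings above specify.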
def save_and_upload_tokenizer(
    tokenizer: Tokenizer,
    tokenizer_json_path: pathlib.Path,
    tokenizer_repo_name: str,
    dataset_name: str,
):
    # save the tokenizer locally
    tokenizer_json_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_json_path.resolve()))
    # upload the tokenizer to Hugging Face
    api = HfApi()
    create_repo(tokenizer_repo_name, exist_ok=True, private=True)
    api.upload_file(
        path_in_repo="tokenizer.json",
        path_or_fileobj=str(tokenizer_json_path.resolve()),
        repo_id=tokenizer_repo_name,
        commit_message=f"Trained tokenizer using {dataset_name}",
    )


def train_tokenizer(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    tokenizer = get_untrained_tokenizer()
    train_dataset = load_dataset(
        config.dataset_repo_name,
        token=True,
        split=ReadInstruction("train", to=config.dataset_percentage, unit="%"),
    )["bytecode"]
    trainer = trainers.WordPieceTrainer(vocab_size=30000, special_tokens=special_tokens)
    tokenizer.train_from_iterator(train_dataset, trainer=trainer)
    post_training_configuration(tokenizer)
    save_and_upload_tokenizer(
        tokenizer,
        config.tokenizer_json_path,
        config.tokenizer_repo_name,
        config.dataset_repo_name,
    )


@click.command(help="Training script for the bytecode tokenizer for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_tokenizer(segmentation_config)


if __name__ == "__main__":
    main()