This commit is contained in:
caandt
2025-03-13 16:56:36 -05:00
parent b2439eee3e
commit 046e80cdd1
27 changed files
@@ -0,0 +1,74 @@
import json
import logging
import pathlib
from dataclasses import dataclass
from typing import Optional


@dataclass
class TrainingParameters:
    batch_size: int
    epochs: int
    learning_rate: float


@dataclass
class SegmentationConfiguration:
    base_repo_name: str
    dataset_repo_name: str
    pretrained_mlm_repo_name: str
    cache_dir: pathlib.Path
    max_token_length: int
    dataset_percentage: int
    mlm_training_parameters: TrainingParameters
    segmentation_training_parameters: TrainingParameters

    def __post_init__(self):
        self.cache_dir = pathlib.Path(self.cache_dir)

    @property
    def tokenizer_repo_name(self):
        return self.base_repo_name + "-tokenizer"

    @property
    def tokenizer_json_path(self):
        return self.cache_dir / "tokenizers" / self.tokenizer_repo_name / "tokenizer.json"

    @property
    def tokenized_dataset_repo_name(self):
        return self.dataset_repo_name + "-tokenized"

    @property
    def mlm_repo_name(self):
        return self.base_repo_name + "-mlm"

    @property
    def mlm_dir(self):
        return self.cache_dir / "models" / self.mlm_repo_name

    @property
    def segmenter_repo_name(self):
        return self.base_repo_name + "-segmenter"

    @property
    def segmenter_dir(self):
        return self.cache_dir / "models" / self.segmenter_repo_name

    @property
    def dataset_dir(self):
        return self.cache_dir / "datasets" / self.dataset_repo_name


def parse_segmentation_config_json(json_file_path: pathlib.Path, logger: Optional[logging.Logger] = None) -> SegmentationConfiguration:
    if not json_file_path.exists():
        raise FileNotFoundError(f"{json_file_path} does not exist")
    if logger:
        logger.info(f"Loading model description from {json_file_path}...")
    with json_file_path.open() as json_file:
        segmentation_config_dict = json.load(json_file)
    segmentation_config_dict["mlm_training_parameters"] = TrainingParameters(**segmentation_config_dict["mlm_training_parameters"])
    segmentation_config_dict["segmentation_training_parameters"] = TrainingParameters(**segmentation_config_dict["segmentation_training_parameters"])
    return SegmentationConfiguration(**segmentation_config_dict)
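# Illustrative round-trip (not part of the original commit; the repo names and
# paths below are hypothetical placeholders). The config JSON mirrors the
# dataclass fields above, with the two TrainingParameters objects as plain dicts.
if __name__ == "__main__":
    import tempfile

    example = {
        "base_repo_name": "example-org/bytecode-segmenter",
        "dataset_repo_name": "example-org/bytecode-dataset",
        "pretrained_mlm_repo_name": "",  # empty string -> train the MLM from scratch
        "cache_dir": "/tmp/segmentation-cache",
        "max_token_length": 512,
        "dataset_percentage": 100,
        "mlm_training_parameters": {"batch_size": 16, "epochs": 3, "learning_rate": 5e-5},
        "segmentation_training_parameters": {"batch_size": 16, "epochs": 5, "learning_rate": 2e-5},
    }
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(example, f)
    config = parse_segmentation_config_json(pathlib.Path(f.name))
    print(config.tokenizer_repo_name)  # example-org/bytecode-segmenter-tokenizer
    print(config.mlm_dir)  # /tmp/segmentation-cache/models/example-org/bytecode-segmenter-mlm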
@@ -0,0 +1,152 @@
import ast
import functools
import os
import pathlib

import click
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from pylingual.segmentation.sliding_window import sliding_window
from transformers import PreTrainedTokenizerFast

bytecode_separator = " <SEP> "


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name
    tokenizer_file = hf_hub_download(repo_id=tokenizer_repo_name, filename="tokenizer.json", token=True, cache_dir=str(tokenizer_dir))
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return tokenizer


# we need to make sure we align all the labels with the proper words:
# special tokens get -100 (ignored by the loss) and every subtoken of a word
# inherits that word's label
def align_labels_with_tokens(labels, word_ids):
    label_names = ["B", "I", "E"]
    id2label = {str(i): label for i, label in enumerate(label_names)}
    label2id = {v: k for k, v in id2label.items()}
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # start of a new word
            current_word = word_id
            label = -100 if word_id is None else int(label2id[labels[word_id]])
            new_labels.append(label)
        elif word_id is None:
            # special token
            new_labels.append(-100)
        else:
            # same word as the previous token
            label = int(label2id[labels[word_id]])
            new_labels.append(label)
    return new_labels
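# Worked example (illustrative, not from the original commit): with per-word
# labels ["B", "I", "E"] and fast-tokenizer word_ids [None, 0, 0, 1, 2, None]
# ([CLS], two subtokens of word 0, words 1 and 2, [SEP]), the special positions
# are masked with -100 and both subtokens of word 0 share label id 0:
assert align_labels_with_tokens(["B", "I", "E"], [None, 0, 0, 1, 2, None]) == [-100, 0, 0, 1, 2, -100]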
# the map function used to tokenize the dataset
def tokenize_and_align_labels(tokenizer: PreTrainedTokenizerFast, max_length: int, examples):
    MAX_WINDOW_LENGTH = 512
    STEP_SIZE = 128
    # parse the strings into lists to better work with the bytecode and boundaries
    parsed_bc = [(codeobj.split(bytecode_separator), ast.literal_eval(bounds)) for codeobj, bounds in zip(examples["bytecode"], examples["boundary"])]
    codeobj_tokens = []
    # count the tokens for each bytecode instruction in a codeobj
    for codeobj, bounds in parsed_bc:
        token_list = []
        for bc, bound in zip(codeobj, bounds):
            token_list.append(((bc, bound), len(tokenizer(bc)[0])))
        codeobj_tokens.append(token_list)
    windows = [sliding_window(codeobj, MAX_WINDOW_LENGTH, STEP_SIZE) for codeobj in codeobj_tokens]
    # remake examples using our windows
    examples["boundary"] = []
    examples["bytecode"] = []
    # go through each window
    for window in windows:
        for item in window:
            # temporary storage for this window's bytecode and bounds
            bytecode = []
            bounds = []
            for bc in item[0]:
                bytecode.append(bc[0])
                bounds.append(bc[1])
            # append the window back into examples
            examples["bytecode"].append(bytecode_separator.join(bytecode))
            examples["boundary"].append(str(bounds))
    tokenized_inputs = tokenizer(
        examples["bytecode"],
        truncation=True,
        max_length=max_length,
    )
    all_labels = examples["boundary"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        labels = labels.replace("'", "").strip("][").split(", ")
        word_ids = tokenized_inputs.word_ids(i)
        labels_len = len(labels)
        max_word_id = word_ids[-2]  # the last entry is the [SEP] special token (word id None)
        # Some examples tokenize into more words than they have labels; rather
        # than drop them, mask their labels entirely and keep them as noisy data.
        if max_word_id >= labels_len:
            new_labels.append([-100] * max_word_id)
        else:
            new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
def tokenize_segmentation_dataset(config: SegmentationConfiguration):
    raw_dataset = load_dataset(config.dataset_repo_name, token=True, cache_dir=str(config.dataset_dir))
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    prepped_tokenize_and_align_labels = functools.partial(tokenize_and_align_labels, tokenizer, config.max_token_length)
    # tokenize the input dataset
    column_names = raw_dataset["train"].column_names
    tokenized_datasets = raw_dataset.map(
        prepped_tokenize_and_align_labels,
        batched=True,
        remove_columns=column_names,
        num_proc=os.cpu_count(),
        desc="Tokenizing datasets",
    )
    tokenized_datasets.push_to_hub(
        config.tokenized_dataset_repo_name,
        private=True,
    )


@click.command(help="Script to tokenize the segmentation dataset given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    tokenize_segmentation_dataset(segmentation_config)


if __name__ == "__main__":
    main()
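# Usage sketch (the script's file name is not shown in this commit view, so the
# name below is hypothetical):
#   python tokenize_segmentation_dataset.py path/to/segmentation_config.json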
@@ -0,0 +1,195 @@
import logging
import os
import pathlib

import click
from datasets import load_dataset
from huggingface_hub import hf_hub_download, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling, PreTrainedTokenizerFast, RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments
from pylingual.segmentation.sliding_window import sliding_window

bytecode_separator = " <SEP> "


def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name
    tokenizer_file = hf_hub_download(
        repo_id=tokenizer_repo_name,
        filename="tokenizer.json",
        token=True,
        cache_dir=str(tokenizer_dir),
    )
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return tokenizer
def load_tokenized_train_dataset(
    dataset_repo_name: str,
    tokenizer: PreTrainedTokenizerFast,
    max_length: int,
    cache_dir: pathlib.Path,
):
    dataset_dir = cache_dir / "datasets" / dataset_repo_name
    raw_dataset = load_dataset(dataset_repo_name, token=True, cache_dir=str(dataset_dir), split="train")
    # tokenize the input data
    column_names = raw_dataset.column_names

    def tokenize(examples):
        # sliding-window parameters
        MAX_WINDOW_LENGTH = 512
        STEP_SIZE = 128
        # parse the strings into lists to better work with the bytecode
        parsed_bc = [codeobj.split(bytecode_separator) for codeobj in examples["bytecode"]]
        codeobj_tokens = []
        # count the tokens for each bytecode instruction in a codeobj
        for codeobj in parsed_bc:
            token_list = []
            for bytecode in codeobj:
                token_list.append((bytecode, len(tokenizer(bytecode)[0])))
            codeobj_tokens.append(token_list)
        windows = [sliding_window(codeobj, MAX_WINDOW_LENGTH, STEP_SIZE) for codeobj in codeobj_tokens]
        # remake examples using our windows
        examples["bytecode"] = []
        # go through each window
        for window in windows:
            for item in window:
                # temporary storage for this window's bytecode
                bytecode = []
                for bc in item[0]:
                    bytecode.append(bc)
                # append the window back into examples
                examples["bytecode"].append(bytecode_separator.join(bytecode))
        return tokenizer(examples["bytecode"], max_length=max_length, truncation=True)

    tokenized_dataset = raw_dataset.map(
        tokenize,
        batched=True,
        remove_columns=column_names,
        num_proc=os.cpu_count(),
        desc="Tokenizing datasets",
    )
    return tokenized_dataset
def load_pretrained_mlm(
    pretrained_mlm_repo_name: str,
    tokenizer_embedding_length: int,
    cache_dir: pathlib.Path,
) -> AutoModelForMaskedLM:
    # load the pretrained masked language model
    pretrained_mlm_dir = cache_dir / "models" / pretrained_mlm_repo_name
    model = AutoModelForMaskedLM.from_pretrained(pretrained_mlm_repo_name, cache_dir=str(pretrained_mlm_dir))
    # resize the token embeddings to fit the tokenizer's vocabulary
    model.resize_token_embeddings(tokenizer_embedding_length)
    return model


def initialize_untrained_mlm(
    tokenizer_embedding_length: int,
    max_token_length: int,
) -> RobertaForMaskedLM:
    # initialize an untrained RoBERTa model; most configuration options are set to match
    # https://huggingface.co/microsoft/codebert-base/blob/main/config.json for direct comparison
    model_config = RobertaConfig(
        max_position_embeddings=max_token_length,  # input length limit
        vocab_size=tokenizer_embedding_length,
        layer_norm_eps=1e-05,
        type_vocab_size=1,
    )
    model = RobertaForMaskedLM(model_config)
    return model
def train_mlm(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    using_pretrained_model = bool(config.pretrained_mlm_repo_name)
    # training arguments; for now the configuration comes from a regular T5 translation model
    training_args = TrainingArguments(
        output_dir=str(config.mlm_dir),
        num_train_epochs=config.mlm_training_parameters.epochs,
        per_device_train_batch_size=config.mlm_training_parameters.batch_size,
        save_steps=1000,
        save_total_limit=5,
        prediction_loss_only=True,
        push_to_hub=True,
        hub_model_id=config.mlm_repo_name,
        hub_private_repo=True,
        ddp_backend="nccl",
        ddp_find_unused_parameters=using_pretrained_model,  # only look for unused parameters in pretrained models
        remove_unused_columns=False,
    )
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    # set up the data collator for the MLM task with the standard 15% masking probability
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    if using_pretrained_model:
        pretrained_mlm = load_pretrained_mlm(config.pretrained_mlm_repo_name, len(tokenizer), config.cache_dir)
    else:
        # RoBERTa reserves two extra position ids for the padding offset, hence max_token_length + 2
        pretrained_mlm = initialize_untrained_mlm(len(tokenizer), config.max_token_length + 2)
    tokenized_training_data = load_tokenized_train_dataset(config.dataset_repo_name, tokenizer, config.max_token_length, config.cache_dir)
    # Hugging Face Trainer class to fine-tune pretrained models
    trainer = Trainer(
        model=pretrained_mlm,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_training_data,
    )
    # training
    trainer.train()
    # save and push only from the main process (LOCAL_RANK defaults to 0 for non-distributed runs)
    if int(os.environ.get("LOCAL_RANK", "0")) == 0:
        # save the model
        trainer.save_model(str(config.mlm_dir))
        trainer.push_to_hub(
            finetuned_from=config.pretrained_mlm_repo_name,
            dataset=config.dataset_repo_name,
            commit_message=f"Trained on {config.dataset_repo_name} using {config.tokenizer_repo_name}",
        )


@click.command(help="Training script for the masked language model pretraining for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_mlm(segmentation_config)


if __name__ == "__main__":
    main()
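# Launch sketch (hypothetical file name): the NCCL DDP backend and the
# LOCAL_RANK check above imply a torchrun-style multi-GPU launch, e.g.
#   torchrun --nproc_per_node=4 train_mlm.py path/to/segmentation_config.json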
@@ -0,0 +1,155 @@
import logging
import os
import pathlib

import click
import evaluate
import numpy as np
from datasets import ReadInstruction, load_dataset
from huggingface_hub import hf_hub_download, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, PreTrainedTokenizerFast, Trainer, TrainingArguments

# two dictionaries, id2label and label2id, which contain the mappings from ID to label and vice versa
label_names = ["B", "I", "E"]
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}


# compute_metrics: the evaluation metric for training and evaluation
def compute_metrics(eval_preds):
    metric = evaluate.load("seqeval")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # remove the ignored index (special tokens) and convert ids back to label names
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]  # noqa: E741
    true_predictions = [[label_names[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]  # noqa: E741
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
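# Illustrative check (not from the original commit): one sequence whose
# [CLS]/[SEP] positions are masked with -100; only the three real tokens
# survive the filtering above and get scored by seqeval.
_example_labels = [[-100, 0, 1, 2, -100]]
assert [[label_names[l] for l in label if l != -100] for label in _example_labels] == [["B", "I", "E"]]  # noqa: E741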
def load_tokenizer(tokenizer_repo_name: str, cache_dir: pathlib.Path) -> PreTrainedTokenizerFast:
    tokenizer_dir = cache_dir / "tokenizers" / tokenizer_repo_name
    tokenizer_file = hf_hub_download(
        repo_id=tokenizer_repo_name,
        filename="tokenizer.json",
        token=True,
        cache_dir=str(tokenizer_dir),
    )
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return tokenizer


def load_tokenized_train_and_valid_dataset(dataset_repo_name: str, cache_dir: pathlib.Path, dataset_percentage: int = 100):
    dataset_dir = cache_dir / "datasets" / dataset_repo_name
    # load the tokenized dataset
    tokenized_train_dataset = load_dataset(
        dataset_repo_name,
        token=True,
        cache_dir=str(dataset_dir),
        split=ReadInstruction("train", to=dataset_percentage, unit="%"),
    )
    tokenized_validation_dataset = load_dataset(
        dataset_repo_name,
        token=True,
        cache_dir=str(dataset_dir),
        split="valid",
    )
    return tokenized_train_dataset, tokenized_validation_dataset
def train_segmentation_model(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    # training arguments
    training_args = TrainingArguments(
        output_dir=str(config.segmenter_dir),
        overwrite_output_dir=True,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=config.segmentation_training_parameters.learning_rate,
        num_train_epochs=config.segmentation_training_parameters.epochs,
        per_device_train_batch_size=config.segmentation_training_parameters.batch_size,
        save_steps=1000,
        weight_decay=0.01,
        fp16=True,
        push_to_hub=True,
        hub_model_id=config.segmenter_repo_name,
        hub_private_repo=True,
        ddp_backend="nccl",
        ddp_find_unused_parameters=True,
        save_total_limit=5,
    )
    # load the pretrained MLM with a fresh token-classification head
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=config.mlm_repo_name,
        id2label=id2label,
        label2id=label2id,
        token=True,
    )
    # set up the DataCollatorForTokenClassification
    tokenizer = load_tokenizer(config.tokenizer_repo_name, config.cache_dir)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, max_length=config.max_token_length)
    (
        tokenized_train_dataset,
        tokenized_validation_dataset,
    ) = load_tokenized_train_and_valid_dataset(config.tokenized_dataset_repo_name, config.cache_dir, config.dataset_percentage)
    # Hugging Face Trainer class to fine-tune pretrained models
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    # training
    trainer.train()
    # save and push only from the main process (LOCAL_RANK defaults to 0 for non-distributed runs)
    if int(os.environ.get("LOCAL_RANK", "0")) == 0:
        # save the model
        trainer.save_model(str(config.segmenter_dir))
        trainer.push_to_hub(
            finetuned_from=config.mlm_repo_name,
            dataset=config.tokenized_dataset_repo_name,
            commit_message=f"Trained on {config.tokenized_dataset_repo_name} using {config.mlm_repo_name}",
        )


@click.command(help="Training script for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_segmentation_model(segmentation_config)


if __name__ == "__main__":
    main()
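# Launch sketch (hypothetical file name), mirroring the MLM stage:
#   torchrun --nproc_per_node=4 train_segmentation_model.py path/to/segmentation_config.json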
@@ -0,0 +1,96 @@
import logging
import pathlib

import click
from datasets import ReadInstruction, load_dataset
from huggingface_hub import HfApi, create_repo, repo_exists
from SegmentationConfiguration import SegmentationConfiguration, parse_segmentation_config_json
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]


def get_untrained_tokenizer() -> Tokenizer:
    # WordPiece tokenization, as used for BERT
    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    # the normalizer decomposes accented characters (NFD) and strips the accents out
    tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
    # the pre-tokenizer splits on <SEP> tokens
    tokenizer.pre_tokenizer = pre_tokenizers.Split("<SEP>", "removed")
    return tokenizer
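# Illustrative (not from the original commit): with the "removed" behavior, the
# pre-tokenizer drops the separator itself, so a string such as
# "LOAD_CONST 0 <SEP> RETURN_VALUE" is pre-tokenized into the two pieces
# "LOAD_CONST 0 " and " RETURN_VALUE" before WordPiece runs on each piece.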
def post_training_configuration(tokenizer: Tokenizer):
    cls_token_id = tokenizer.token_to_id("[CLS]")
    sep_token_id = tokenizer.token_to_id("[SEP]")
    # set the decoder for the tokenizer
    tokenizer.decoder = decoders.WordPiece(prefix="##")
    # for TemplateProcessing, we have to specify how to treat a single sequence and a pair of sequences
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
    )
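# Illustrative (not from the original commit): once this post-processor is set,
# encoding a single sequence yields [CLS] ... [SEP] with all type ids 0, and a
# pair yields [CLS] A [SEP] B [SEP] with B's tokens and its closing [SEP]
# carrying type id 1, exactly as the template strings above specify.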
def save_and_upload_tokenizer(
    tokenizer: Tokenizer,
    tokenizer_json_path: pathlib.Path,
    tokenizer_repo_name: str,
    dataset_name: str,
):
    # save the tokenizer locally
    tokenizer_json_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_json_path.resolve()))
    # upload the tokenizer to Hugging Face
    api = HfApi()
    create_repo(tokenizer_repo_name, exist_ok=True, private=True)
    api.upload_file(
        path_in_repo="tokenizer.json",
        path_or_fileobj=str(tokenizer_json_path.resolve()),
        repo_id=tokenizer_repo_name,
        commit_message=f"Trained tokenizer using {dataset_name}",
    )


def train_tokenizer(config: SegmentationConfiguration):
    if repo_exists(config.base_repo_name):
        logging.error(f"{config.base_repo_name} already exists")
        exit(1)
    tokenizer = get_untrained_tokenizer()
    train_dataset = load_dataset(
        config.dataset_repo_name,
        token=True,
        split=ReadInstruction("train", to=config.dataset_percentage, unit="%"),
    )["bytecode"]
    trainer = trainers.WordPieceTrainer(vocab_size=30000, special_tokens=special_tokens)
    tokenizer.train_from_iterator(train_dataset, trainer=trainer)
    post_training_configuration(tokenizer)
    save_and_upload_tokenizer(
        tokenizer,
        config.tokenizer_json_path,
        config.tokenizer_repo_name,
        config.dataset_repo_name,
    )


@click.command(help="Training script for the bytecode tokenizer for the segmentation model given a segmentation json.")
@click.argument("json_path", type=str)
def main(json_path: str):
    json_file_path = pathlib.Path(json_path)
    segmentation_config = parse_segmentation_config_json(json_file_path)
    train_tokenizer(segmentation_config)


if __name__ == "__main__":
    main()