mirror of
https://github.com/syssec-utd/pylingual.git
synced 2026-05-10 18:39:03 -07:00
Merge pull request #86 from syssec-utd/uv-migration
Migrate from Poetry to UV
This commit is contained in:
@@ -4,7 +4,7 @@ PyLingual's accuracy is dependent on having accurate segmentation and statement
|
||||
|
||||
## Dataset generation
|
||||
|
||||
First install [pyenv](https://github.com/pyenv/pyenv) and the required Python versions for the dataset. Create a dataset JSON file based off the sample (`sample_jsons/py36-sample-data.json`).
|
||||
Create a dataset JSON file based on the sample (`sample_jsons/py36-sample-data.json`).
|
||||
|
||||
The dataset directory should be structured like so, with only one `.py` file per directory:
|
||||
|
||||
@@ -24,7 +24,7 @@ dataset
|
||||
The names of the inner directories and files do not matter. Then create the dataset:
|
||||
|
||||
```
|
||||
python prepare_dataset.py <path to JSON>
|
||||
uv run prepare_dataset.py <path to JSON>
|
||||
```
|
||||
|
||||
## Segmentation model
|
||||
@@ -32,7 +32,7 @@ python prepare_dataset.py <path to JSON>
|
||||
Create a segmentation model JSON file based on the sample (`sample_jsons/py36-sample-segmentation.json`). Then train the model:
|
||||
|
||||
```
|
||||
python train_models.py --segmentation <path to JSON>
|
||||
uv run train_models.py --segmentation <path to JSON>
|
||||
```
|
||||
|
||||
## Statement model
|
||||
@@ -40,7 +40,7 @@ python train_models.py --segmentation <path to JSON>
|
||||
Create a statement model JSON file based on the sample (`sample_jsons/py36-sample-statement.json`). Then train the model:
|
||||
|
||||
```
|
||||
python train_models.py --statement <path to JSON>
|
||||
uv run train_models.py --statement <path to JSON>
|
||||
```
|
||||
|
||||
Once the models are trained, point the decompiler at them: either update `../pylingual/decompiler_config.yaml` in place, or create a separate config file, replacing the old model entries with the newly trained ones.
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../" }
|
||||
# ///
|
||||
|
||||
import contextlib
|
||||
import shutil
|
||||
import json
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import csv
|
||||
import itertools
|
||||
import logging
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import multiprocessing
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../" }
|
||||
# ///
|
||||
|
||||
import json
|
||||
import logging
|
||||
import pathlib
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import ast
|
||||
import functools
|
||||
import os
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../" }
|
||||
# ///
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
@@ -12,12 +21,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
|
||||
|
||||
# train tokenizer
|
||||
logger.info("training tokenizer...")
|
||||
subprocess.run(["python", segmentation_root / "train_tokenizer.py", segmentation_config_path])
|
||||
subprocess.run(["uv", "run", segmentation_root / "train_tokenizer.py", segmentation_config_path])
|
||||
|
||||
# train mlm (single gpu to avoid conflicts with local tokenized data)
|
||||
logger.info("training masked language model...")
|
||||
subprocess.run(
|
||||
[
|
||||
"uv", "run",
|
||||
"torchrun",
|
||||
f"--nnodes={nnodes}",
|
||||
f"--nproc-per-node={nproc_per_node}",
|
||||
@@ -31,12 +41,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
|
||||
|
||||
# tokenize dataset
|
||||
logger.info("tokenizing segmentation dataset...")
|
||||
subprocess.run(["python", segmentation_root / "tokenize_seg.py", segmentation_config_path])
|
||||
subprocess.run(["uv", "run", segmentation_root / "tokenize_seg.py", segmentation_config_path])
|
||||
|
||||
# train segmentation model (4 gpus)
|
||||
logger.info("training segmentation model...")
|
||||
subprocess.run(
|
||||
[
|
||||
"uv", "run",
|
||||
"torchrun",
|
||||
f"--nnodes={nnodes}",
|
||||
f"--nproc-per-node={nproc_per_node}",
|
||||
@@ -53,16 +64,17 @@ def train_statement(statement_config_path: pathlib.Path, logger: logging.Logger,
|
||||
statement_root = pathlib.Path(__file__).parent / "statement"
|
||||
|
||||
# manual tokenizer
|
||||
subprocess.run(["python", statement_root / "train_tokenizer_auto.py", statement_config_path])
|
||||
subprocess.run(["uv", "run", statement_root / "train_tokenizer_auto.py", statement_config_path])
|
||||
|
||||
# tokenize statement dataset with salesforce tokenizer
|
||||
logger.info("tokenizing statement dataset...")
|
||||
subprocess.run(["python", statement_root / "tokenize_seq2seq.py", statement_config_path])
|
||||
subprocess.run(["uv", "run", statement_root / "tokenize_seq2seq.py", statement_config_path])
|
||||
|
||||
# train statement model (4 gpus)
|
||||
logger.info("training statement model...")
|
||||
subprocess.run(
|
||||
[
|
||||
"uv", "run",
|
||||
"torchrun",
|
||||
f"--nnodes={nnodes}",
|
||||
f"--nproc-per-node={nproc_per_node}",
|
||||
|
||||
Reference in New Issue
Block a user