Merge pull request #86 from syssec-utd/uv-migration

Migrate from Poetry to UV
This commit is contained in:
Joel-Flores123
2025-09-29 18:44:50 -05:00
committed by GitHub
16 changed files with 1814 additions and 3175 deletions
+4 -4
View File
@@ -4,7 +4,7 @@ PyLingual's accuracy is dependent on having accurate segmentation and statement
## Dataset generation
First install [pyenv](https://github.com/pyenv/pyenv) and the required Python versions for the dataset. Create a dataset JSON file based on the sample (`sample_jsons/py36-sample-data.json`).
Create a dataset JSON file based on the sample (`sample_jsons/py36-sample-data.json`).
The dataset directory should be structured like so, with only one `.py` file per directory:
@@ -24,7 +24,7 @@ dataset
The names of the inner directories and files do not matter. Then create the dataset:
```
python prepare_dataset.py <path to JSON>
uv run prepare_dataset.py <path to JSON>
```
## Segmentation model
@@ -32,7 +32,7 @@ python prepare_dataset.py <path to JSON>
Create a segmentation model JSON file based on the sample (`sample_jsons/py36-sample-segmentation.json`). Then train the model:
```
python train_models.py --segmentation <path to JSON>
uv run train_models.py --segmentation <path to JSON>
```
## Statement model
@@ -40,7 +40,7 @@ python train_models.py --segmentation <path to JSON>
Create a statement model JSON file based on the sample (`sample_jsons/py36-sample-statement.json`). Then train the model:
```
python train_models.py --statement <path to JSON>
uv run train_models.py --statement <path to JSON>
```
Once models are trained, update `../pylingual/decompiler_config.yaml` or create a separate config file by replacing the old models with the newly trained ones.
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import contextlib
import shutil
import json
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import csv
import itertools
import logging
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import itertools
import logging
import multiprocessing
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import json
import logging
import pathlib
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import ast
import functools
import os
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import logging
import os
import pathlib
+16 -4
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import logging
import os
import pathlib
@@ -12,12 +21,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# train tokenizer
logger.info("training tokenizer...")
subprocess.run(["python", segmentation_root / "train_tokenizer.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "train_tokenizer.py", segmentation_config_path])
# train mlm (single gpu to avoid conflicts with local tokenized data)
logger.info("training masked language model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -31,12 +41,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# tokenize dataset
logger.info("tokenizing segmentation dataset...")
subprocess.run(["python", segmentation_root / "tokenize_seg.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "tokenize_seg.py", segmentation_config_path])
# train segmentation model (4 gpus)
logger.info("training segmentation model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -53,16 +64,17 @@ def train_statement(statement_config_path: pathlib.Path, logger: logging.Logger,
statement_root = pathlib.Path(__file__).parent / "statement"
# manual tokenizer
subprocess.run(["python", statement_root / "train_tokenizer_auto.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "train_tokenizer_auto.py", statement_config_path])
# tokenize statement dataset with salesforce tokenizer
logger.info("tokenizing statement dataset...")
subprocess.run(["python", statement_root / "tokenize_seq2seq.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "tokenize_seq2seq.py", statement_config_path])
# train statement model (4 gpus)
logger.info("training statement model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",