Merge pull request #86 from syssec-utd/uv-migration

Migrate from Poetry to UV
This commit is contained in:
Joel-Flores123
2025-09-29 18:44:50 -05:00
committed by GitHub
16 changed files with 1814 additions and 3175 deletions
+4 -4
View File
@@ -4,7 +4,7 @@ PyLingual's accuracy is dependent on having accurate segmentation and statement
## Dataset generation
First install [pyenv](https://github.com/pyenv/pyenv) and the required Python versions for the dataset. Create a dataset JSON file based on the sample (`sample_jsons/py36-sample-data.json`).
Create a dataset JSON file based on the sample (`sample_jsons/py36-sample-data.json`).
The dataset directory should be structured like so, with only one `.py` file per directory:
@@ -24,7 +24,7 @@ dataset
The names of the inner directories and files do not matter. Then create the dataset:
```
python prepare_dataset.py <path to JSON>
uv run prepare_dataset.py <path to JSON>
```
## Segmentation model
@@ -32,7 +32,7 @@ python prepare_dataset.py <path to JSON>
Create a segmentation model JSON file based on the sample (`sample_jsons/py36-sample-segmentation.json`). Then train the model:
```
python train_models.py --segmentation <path to JSON>
uv run train_models.py --segmentation <path to JSON>
```
## Statement model
@@ -40,7 +40,7 @@ python train_models.py --segmentation <path to JSON>
Create a statement model JSON file based on the sample (`sample_jsons/py36-sample-statement.json`). Then train the model:
```
python train_models.py --statement <path to JSON>
uv run train_models.py --statement <path to JSON>
```
Once models are trained, update `../pylingual/decompiler_config.yaml` or create a separate config file by replacing the old models with the newly trained ones.
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import contextlib
import shutil
import json
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import csv
import itertools
import logging
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import itertools
import logging
import multiprocessing
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import json
import logging
import pathlib
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import ast
import functools
import os
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import logging
import os
import pathlib
+16 -4
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import logging
import os
import pathlib
@@ -12,12 +21,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# train tokenizer
logger.info("training tokenizer...")
subprocess.run(["python", segmentation_root / "train_tokenizer.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "train_tokenizer.py", segmentation_config_path])
# train mlm (single gpu to avoid conflicts with local tokenized data)
logger.info("training masked language model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -31,12 +41,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# tokenize dataset
logger.info("tokenizing segmentation dataset...")
subprocess.run(["python", segmentation_root / "tokenize_seg.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "tokenize_seg.py", segmentation_config_path])
# train segmentation model (4 gpus)
logger.info("training segmentation model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -53,16 +64,17 @@ def train_statement(statement_config_path: pathlib.Path, logger: logging.Logger,
statement_root = pathlib.Path(__file__).parent / "statement"
# manual tokenizer
subprocess.run(["python", statement_root / "train_tokenizer_auto.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "train_tokenizer_auto.py", statement_config_path])
# tokenize statement dataset with salesforce tokenizer
logger.info("tokenizing statement dataset...")
subprocess.run(["python", statement_root / "tokenize_seq2seq.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "tokenize_seq2seq.py", statement_config_path])
# train statement model (4 gpus)
logger.info("training statement model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",