Merge pull request #86 from syssec-utd/uv-migration

Migrate from Poetry to UV
This commit is contained in:
Joel-Flores123
2025-09-29 18:44:50 -05:00
committed by GitHub
16 changed files with 1814 additions and 3175 deletions
+1
View File
@@ -1,5 +1,6 @@
dataset/
venv/
.venv/
*.pyc
.idea/
.python-version
+6 -11
View File
@@ -8,26 +8,21 @@ This codebase is optimized for readability and future extension, so there may in
## Requirements
- Python 3.12
- `uv` Python package manager ([installation](https://docs.astral.sh/uv/getting-started/installation/)), used for project dependencies and managed Python versions.
### Compiling bytecode
### Decompiling End-Of-Life Python Versions
Some parts of PyLingual require the ability to compile bytecode in a different Python version (equivalence check and model training). For this, you will need the following:
To verify decompilation correctness and produce model training sets, PyLingual requires the ability to compile Python in the target version.
- [pyenv](https://github.com/pyenv/pyenv) with all Python versions you want to compile to
- [pyenv-win](https://github.com/pyenv-win/pyenv-win) for Windows
For current Python versions (3.8-3.13), PyLingual uses `uv`'s managed installations, but for Python 3.6 and 3.7, PyLingual uses [pyenv](https://github.com/pyenv/pyenv) ([pyenv-win](https://github.com/pyenv-win/pyenv-win) for Windows).
## Setup
Install from source, using [Poetry](https://python-poetry.org/):
Install from source, using [uv](https://docs.astral.sh/uv/):
```sh
git clone https://github.com/syssec-utd/pylingual
cd pylingual
python -m venv venv
source venv/bin/activate
pip install poetry>=2.0
poetry install
uv tool install ./pylingual
```
## Usage
+4 -4
View File
@@ -4,7 +4,7 @@ PyLingual's accuracy is dependent on having accurate segmentation and statement
## Dataset generation
First install [pyenv](https://github.com/pyenv/pyenv) and the required Python versions for the dataset. Create a dataset JSON file based off the sample (`sample_jsons/py36-sample-data.json`).
Create a dataset JSON file based off the sample (`sample_jsons/py36-sample-data.json`).
The dataset directory should be structured like so, with only one `.py` file per directory:
@@ -24,7 +24,7 @@ dataset
The names of the inner directories and files do not matter. Then create the dataset:
```
python prepare_dataset.py <path to JSON>
uv run prepare_dataset.py <path to JSON>
```
## Segmentation model
@@ -32,7 +32,7 @@ python prepare_dataset.py <path to JSON>
Create a segmentation model JSON file based off the sample (`sample_jsons/py36-sample-segmentation.json`). Then train the model:
```
python train_models.py --segmentation <path to JSON>
uv run train_models.py --segmentation <path to JSON>
```
## Statement model
@@ -40,7 +40,7 @@ python train_models.py --segmentation <path to JSON>
Create a statement model JSON file based off the sample (`sample_jsons/py36-sample-statement.json`). Then train the model:
```
python train_models.py --statement <path to JSON>
uv run train_models.py --statement <path to JSON>
```
Once models are trained, update `../pylingual/decompiler_config.yaml` or create a separate config file by replacing the old models with the newly trained ones.
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import contextlib
import shutil
import json
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import csv
import itertools
import logging
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import itertools
import logging
import multiprocessing
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import json
import logging
import pathlib
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import ast
import functools
import os
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import logging
import os
import pathlib
+16 -4
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import logging
import os
import pathlib
@@ -12,12 +21,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# train tokenizer
logger.info("training tokenizer...")
subprocess.run(["python", segmentation_root / "train_tokenizer.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "train_tokenizer.py", segmentation_config_path])
# train mlm (single gpu to avoid conflicts with local tokenized data)
logger.info("training masked language model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -31,12 +41,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# tokenize dataset
logger.info("tokenizing segmentation dataset...")
subprocess.run(["python", segmentation_root / "tokenize_seg.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "tokenize_seg.py", segmentation_config_path])
# train segmentation model (4 gpus)
logger.info("training segmentation model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -53,16 +64,17 @@ def train_statement(statement_config_path: pathlib.Path, logger: logging.Logger,
statement_root = pathlib.Path(__file__).parent / "statement"
# manual tokenizer
subprocess.run(["python", statement_root / "train_tokenizer_auto.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "train_tokenizer_auto.py", statement_config_path])
# tokenize statement dataset with salesforce tokenizer
logger.info("tokenizing statement dataset...")
subprocess.run(["python", statement_root / "tokenize_seq2seq.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "tokenize_seq2seq.py", statement_config_path])
# train statement model (4 gpus)
logger.info("training statement model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
Generated
-3126
View File
File diff suppressed because it is too large Load Diff
+4 -5
View File
@@ -41,7 +41,7 @@ from pylingual.control_flow_reconstruction.structure import bc_to_cft
from pylingual.control_flow_reconstruction.cft import MetaTemplate
from pylingual.equivalence_check import TestResult, compare_pyc
from pylingual.models import CacheTranslator, load_models
from pylingual.utils.generate_bytecode import CompileError, compile_version
from pylingual.utils.generate_bytecode import CompileError, PyenvError, compile_version
from pylingual.masking.model_disasm import create_global_masker, restore_masked_source_text
from pylingual.editable_bytecode import PYCFile
from pylingual.segmentation.segmentation_search_strategies import get_top_k_predictions, m_deep_top_k, naive_confidence_priority, filter_subwords
@@ -122,10 +122,6 @@ class Decompiler:
self.run_cflow_reconstruction()
self.reconstruct_source()
if shutil.which("pyenv") is None and self.version != sys.version_info:
logger.warning(f"pyenv is not installed so equivalence check cannot be performed. Please install pyenv manually along with the required Python version ({self.version}) or run PyLingual again with the --init-pyenv flag")
return DecompilerResult(self.indented_source, [TestResult(False, "Cannot compare equivalence without pyenv installed", bc, bc) for bc in self.pyc.iter_bytecodes()], self.pyc, self.version)
self.equivalence_results = self.check_reconstruction(self.indented_source)
self.correct_failures()
@@ -361,6 +357,9 @@ class Decompiler:
compile_version(src, pyc, self.version)
except CompileError as e:
return [e]
except PyenvError as e:
logger.error(f"Could not check decompilation due to pyenv error: {e}")
return []
else:
return compare_pyc(self.pyc, pyc)
-2
View File
@@ -134,7 +134,6 @@ def main(files: list[str], out_dir: Path | None, config_file: Path | None, versi
logger.exception(f"Failed to decompile {pyc_path}")
console.rule()
def install_pyenv():
if shutil.which("pyenv") is not None:
logger.warning("pyenv seems to already be installed, ignoring --init-pyenv...")
@@ -165,6 +164,5 @@ def install_pyenv():
return False
return True
if __name__ == "__main__":
main()
+46 -10
View File
@@ -5,11 +5,16 @@ import sys
import py_compile
import platform
import os
import re
import shutil
from pylingual.utils.version import PythonVersion
UV_VERSIONS = {PythonVersion((3, x)) for x in range(8, 14)}
class CompileError(Exception):
success = False
bc_a = None
@@ -19,18 +24,32 @@ class PyenvError(Exception):
pass
def compile_version(py_file, out_file, version):
py_file = str(py_file)
out_file = str(out_file)
version = PythonVersion(version)
if version == sys.version_info:
try:
py_compile.compile(py_file, cfile=out_file, doraise=True, optimize=0)
except py_compile.PyCompileError as e:
raise CompileError(str(e))
return
def _compile_native(py_file: str, out_file: str):
try:
py_compile.compile(py_file, cfile=out_file, doraise=True, optimize=0)
except py_compile.PyCompileError as e:
raise CompileError(str(e))
return
def _compile_uv(py_file: str, out_file: str, version: PythonVersion):
    """Compile *py_file* to *out_file* with a uv-managed interpreter via ``uvx``.

    The compile runs as a ``python -c`` one-liner inside the target version;
    the embedded assert guards against uv handing back a different Python
    than the one requested.

    Raises:
        CompileError: if the target interpreter reported anything on stderr
            (other than uv's own on-demand download notices).
    """
    snippet = (
        f"import py_compile, sys; "
        f"assert sys.version_info[:2] == {version.as_tuple()!r}; "
        f"py_compile.compile({py_file!r}, cfile={out_file!r})"
    )
    env = dict(os.environ)
    env["PYTHONWARNINGS"] = "ignore"
    # NOTE(review): if `uvx` is not on PATH this raises FileNotFoundError,
    # unlike the pyenv path which raises a dedicated error — confirm intended.
    result = subprocess.run(
        ["uvx", "--python", version.as_str(), "python", "-c", snippet],
        shell=False,
        capture_output=True,
        text=True,
        env=env,
    )
    # Ignore stderr messages from uv downloading versions on demand.
    errors = re.sub(r"Downloading .+\n", "", result.stderr)
    if errors:
        raise CompileError(errors)
def _compile_pyenv(py_file: str, out_file: str, version: PythonVersion):
which_pyenv = shutil.which("pyenv")
if not which_pyenv:
raise PyenvError(f"Could not find pyenv installation to compile in version {version.as_str()}. Try running with --init-pyenv to enable verification for end-of-life Python versions.")
version_win = None
if platform.system() == "Windows": # workaround for pyenv-win being bugged when passing versions like 3.x not 3.x.y
@@ -62,3 +81,20 @@ def compile_version(py_file, out_file, version):
if output.stderr:
raise CompileError(output.stderr)
def compile_version(py_file, out_file, version):
    """Compile *py_file* to bytecode at *out_file* targeting Python *version*.

    Dispatches to the cheapest available backend: the running interpreter when
    the versions match, a uv-managed interpreter for versions in UV_VERSIONS,
    and pyenv for anything else (end-of-life versions).

    Raises:
        CompileError: if compilation fails in any backend.
        PyenvError: if the pyenv backend is required but pyenv is unavailable.
    """
    source_path = str(py_file)
    target_path = str(out_file)
    wanted = PythonVersion(version)
    if wanted == sys.version_info:
        _compile_native(py_file=source_path, out_file=target_path)
        return
    if wanted in UV_VERSIONS:
        _compile_uv(py_file=source_path, out_file=target_path, version=wanted)
        return
    _compile_pyenv(py_file=source_path, out_file=target_path, version=wanted)
+19 -13
View File
@@ -1,16 +1,19 @@
[tool.poetry]
requires-poetry = ">=2.0"
[project]
name = "pylingual"
version = "0.1.0"
description = "A Python bytecode decompilation tool, supporting versions 3.6 - 3.13"
authors = [ {name = "syssec-utd"} ]
keywords = ["python", "decompilation", "pylingual", "reversing", "decompiler", "bytecode"]
license = "GPL-3.0-only"
authors = [{ name = "syssec-utd" }]
readme = "README.md"
requires-python = ">=3.12.0"
requires-python = ">= 3.12"
license = "GPL-3.0-only"
keywords = [
"python",
"decompilation",
"pylingual",
"reversing",
"decompiler",
"bytecode",
]
dependencies = [
"asttokens",
"datasets",
@@ -28,7 +31,7 @@ dependencies = [
"transformers==4.46.1",
"transformers[torch]",
"xdis>=6.1.4",
"click"
"click",
]
[project.urls]
@@ -38,10 +41,9 @@ homepage = "https://pylingual.io"
pylingual = "pylingual.main:main"
[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"
# linter and formatter
[tool.ruff]
# Exclude commonly ignored directories.
exclude = [
@@ -55,7 +57,6 @@ exclude = [
".mypy_cache",
".nox",
".pants.d",
".pyenv",
".pytest_cache",
".pytype",
".ruff_cache",
@@ -94,3 +95,8 @@ skip-magic-trailing-comma = false
# to be opt-out in the future.
docstring-code-format = true
docstring-code-line-length = "dynamic"
[tool.uv.workspace]
members = [
"pylingual/tools",
]
Generated
+1664
View File
File diff suppressed because it is too large Load Diff