Merge pull request #86 from syssec-utd/uv-migration

Migrate from Poetry to UV
This commit is contained in:
Joel-Flores123
2025-09-29 18:44:50 -05:00
committed by GitHub
16 changed files with 1814 additions and 3175 deletions
+1
View File
@@ -1,5 +1,6 @@
dataset/
venv/
.venv/
*.pyc
.idea/
.python-version
+6 -11
View File
@@ -8,26 +8,21 @@ This codebase is optimized for readability and future extension, so there may in
## Requirements
- Python 3.12
- `uv` Python package manager ([installation](https://docs.astral.sh/uv/getting-started/installation/)), used for project dependencies and managed Python versions.
### Compiling bytecode
### Decompiling End-Of-Life Python Versions
Some parts of PyLingual require the ability to compile bytecode in a different Python version (equivalence check and model training). For this, you will need the following:
To verify decompilation correctness and produce model training sets, PyLingual requires the ability to compile Python in the target version.
- [pyenv](https://github.com/pyenv/pyenv) with all Python versions you want to compile to
- [pyenv-win](https://github.com/pyenv-win/pyenv-win) for Windows
For current Python versions (3.8-3.13), PyLingual uses `uv`'s managed installations, but for Python 3.6 and 3.7, PyLingual uses [pyenv](https://github.com/pyenv/pyenv) ([pyenv-win](https://github.com/pyenv-win/pyenv-win) for Windows).
## Setup
Install from source, using [Poetry](https://python-poetry.org/):
Install from source, using [uv](https://docs.astral.sh/uv/):
```sh
git clone https://github.com/syssec-utd/pylingual
cd pylingual
python -m venv venv
source venv/bin/activate
pip install poetry>=2.0
poetry install
uv tool install ./pylingual
```
## Usage
+4 -4
View File
@@ -4,7 +4,7 @@ PyLingual's accuracy is dependent on having accurate segmentation and statement
## Dataset generation
First install [pyenv](https://github.com/pyenv/pyenv) and the required Python versions for the dataset. Create a dataset JSON file based off the sample (`sample_jsons/py36-sample-data.json`).
Create a dataset JSON file based off the sample (`sample_jsons/py36-sample-data.json`).
The dataset directory should be structured like so, with only one `.py` file per directory:
@@ -24,7 +24,7 @@ dataset
The names of the inner directories and files do not matter. Then create the dataset:
```
python prepare_dataset.py <path to JSON>
uv run prepare_dataset.py <path to JSON>
```
## Segmentation model
@@ -32,7 +32,7 @@ python prepare_dataset.py <path to JSON>
Create a segmentation model JSON file based off the sample (`sample_jsons/py36-sample-segmentation.json`). Then train the model:
```
python train_models.py --segmentation <path to JSON>
uv run train_models.py --segmentation <path to JSON>
```
## Statement model
@@ -40,7 +40,7 @@ python train_models.py --segmentation <path to JSON>
Create a statement model JSON file based off the sample (`sample_jsons/py36-sample-statement.json`). Then train the model:
```
python train_models.py --statement <path to JSON>
uv run train_models.py --statement <path to JSON>
```
Once models are trained, update `../pylingual/decompiler_config.yaml` or create a separate config file by replacing the old models with the newly trained ones.
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import contextlib
import shutil
import json
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import csv
import itertools
import logging
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import itertools
import logging
import multiprocessing
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import json
import logging
import pathlib
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import ast
import functools
import os
+9
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../../" }
# ///
import logging
import os
import pathlib
+16 -4
View File
@@ -1,3 +1,12 @@
# /// script
# requires-python = ">= 3.12"
# dependencies = [
# "pylingual",
# ]
# [tool.uv.sources]
# pylingual = { path = "../" }
# ///
import logging
import os
import pathlib
@@ -12,12 +21,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# train tokenizer
logger.info("training tokenizer...")
subprocess.run(["python", segmentation_root / "train_tokenizer.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "train_tokenizer.py", segmentation_config_path])
# train mlm (single gpu to avoid conflicts with local tokenized data)
logger.info("training masked language model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -31,12 +41,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
# tokenize dataset
logger.info("tokenizing segmentation dataset...")
subprocess.run(["python", segmentation_root / "tokenize_seg.py", segmentation_config_path])
subprocess.run(["uv", "run", segmentation_root / "tokenize_seg.py", segmentation_config_path])
# train segmentation model (4 gpus)
logger.info("training segmentation model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
@@ -53,16 +64,17 @@ def train_statement(statement_config_path: pathlib.Path, logger: logging.Logger,
statement_root = pathlib.Path(__file__).parent / "statement"
# manual tokenizer
subprocess.run(["python", statement_root / "train_tokenizer_auto.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "train_tokenizer_auto.py", statement_config_path])
# tokenize statement dataset with salesforce tokenizer
logger.info("tokenizing statement dataset...")
subprocess.run(["python", statement_root / "tokenize_seq2seq.py", statement_config_path])
subprocess.run(["uv", "run", statement_root / "tokenize_seq2seq.py", statement_config_path])
# train statement model (4 gpus)
logger.info("training statement model...")
subprocess.run(
[
"uv", "run",
"torchrun",
f"--nnodes={nnodes}",
f"--nproc-per-node={nproc_per_node}",
Generated
-3126
View File
File diff suppressed because it is too large Load Diff
+4 -5
View File
@@ -41,7 +41,7 @@ from pylingual.control_flow_reconstruction.structure import bc_to_cft
from pylingual.control_flow_reconstruction.cft import MetaTemplate
from pylingual.equivalence_check import TestResult, compare_pyc
from pylingual.models import CacheTranslator, load_models
from pylingual.utils.generate_bytecode import CompileError, compile_version
from pylingual.utils.generate_bytecode import CompileError, PyenvError, compile_version
from pylingual.masking.model_disasm import create_global_masker, restore_masked_source_text
from pylingual.editable_bytecode import PYCFile
from pylingual.segmentation.segmentation_search_strategies import get_top_k_predictions, m_deep_top_k, naive_confidence_priority, filter_subwords
@@ -122,10 +122,6 @@ class Decompiler:
self.run_cflow_reconstruction()
self.reconstruct_source()
if shutil.which("pyenv") is None and self.version != sys.version_info:
logger.warning(f"pyenv is not installed so equivalence check cannot be performed. Please install pyenv manually along with the required Python version ({self.version}) or run PyLingual again with the --init-pyenv flag")
return DecompilerResult(self.indented_source, [TestResult(False, "Cannot compare equivalence without pyenv installed", bc, bc) for bc in self.pyc.iter_bytecodes()], self.pyc, self.version)
self.equivalence_results = self.check_reconstruction(self.indented_source)
self.correct_failures()
@@ -361,6 +357,9 @@ class Decompiler:
compile_version(src, pyc, self.version)
except CompileError as e:
return [e]
except PyenvError as e:
logger.error(f"Could not check decompilation due to pyenv error: {e}")
return []
else:
return compare_pyc(self.pyc, pyc)
-2
View File
@@ -134,7 +134,6 @@ def main(files: list[str], out_dir: Path | None, config_file: Path | None, versi
logger.exception(f"Failed to decompile {pyc_path}")
console.rule()
def install_pyenv():
if shutil.which("pyenv") is not None:
logger.warning("pyenv seems to already be installed, ignoring --init-pyenv...")
@@ -165,6 +164,5 @@ def install_pyenv():
return False
return True
if __name__ == "__main__":
main()
+46 -10
View File
@@ -5,11 +5,16 @@ import sys
import py_compile
import platform
import os
import re
import shutil
from pylingual.utils.version import PythonVersion
UV_VERSIONS = {PythonVersion((3, x)) for x in range(8, 14)}
class CompileError(Exception):
success = False
bc_a = None
@@ -19,18 +24,32 @@ class PyenvError(Exception):
pass
def compile_version(py_file, out_file, version):
py_file = str(py_file)
out_file = str(out_file)
version = PythonVersion(version)
if version == sys.version_info:
try:
py_compile.compile(py_file, cfile=out_file, doraise=True, optimize=0)
except py_compile.PyCompileError as e:
raise CompileError(str(e))
return
def _compile_native(py_file: str, out_file: str):
try:
py_compile.compile(py_file, cfile=out_file, doraise=True, optimize=0)
except py_compile.PyCompileError as e:
raise CompileError(str(e))
return
def _compile_uv(py_file: str, out_file: str, version: PythonVersion):
    """Compile *py_file* to *out_file* with a uv-managed interpreter via ``uvx``.

    The compile runs as a ``python -c`` one-liner inside the target version;
    the embedded assert guards against uv handing back a different Python
    than the one requested.

    Raises:
        CompileError: if the target interpreter reported anything on stderr
            (other than uv's own on-demand download notices).
    """
    snippet = (
        f"import py_compile, sys; "
        f"assert sys.version_info[:2] == {version.as_tuple()!r}; "
        f"py_compile.compile({py_file!r}, cfile={out_file!r})"
    )
    env = dict(os.environ)
    env["PYTHONWARNINGS"] = "ignore"
    # NOTE(review): if `uvx` is not on PATH this raises FileNotFoundError,
    # unlike the pyenv path which raises a dedicated error — confirm intended.
    result = subprocess.run(
        ["uvx", "--python", version.as_str(), "python", "-c", snippet],
        shell=False,
        capture_output=True,
        text=True,
        env=env,
    )
    # Ignore stderr messages from uv downloading versions on demand.
    errors = re.sub(r"Downloading .+\n", "", result.stderr)
    if errors:
        raise CompileError(errors)
def _compile_pyenv(py_file: str, out_file: str, version: PythonVersion):
which_pyenv = shutil.which("pyenv")
if not which_pyenv:
raise PyenvError(f"Could not find pyenv installation to compile in version {version.as_str()}. Try running with --init-pyenv to enable verification for end-of-life Python versions.")
version_win = None
if platform.system() == "Windows": # workaround for pyenv-win being bugged when passing versions like 3.x not 3.x.y
@@ -62,3 +81,20 @@ def compile_version(py_file, out_file, version):
if output.stderr:
raise CompileError(output.stderr)
def compile_version(py_file, out_file, version):
    """Compile *py_file* to bytecode at *out_file* targeting Python *version*.

    Dispatches to the cheapest available backend: the running interpreter when
    the versions match, a uv-managed interpreter for versions in UV_VERSIONS,
    and pyenv for anything else (end-of-life versions).

    Raises:
        CompileError: if compilation fails in any backend.
        PyenvError: if the pyenv backend is required but pyenv is unavailable.
    """
    source_path = str(py_file)
    target_path = str(out_file)
    wanted = PythonVersion(version)
    if wanted == sys.version_info:
        _compile_native(py_file=source_path, out_file=target_path)
        return
    if wanted in UV_VERSIONS:
        _compile_uv(py_file=source_path, out_file=target_path, version=wanted)
        return
    _compile_pyenv(py_file=source_path, out_file=target_path, version=wanted)
+19 -13
View File
@@ -1,16 +1,19 @@
[tool.poetry]
requires-poetry = ">=2.0"
[project]
name = "pylingual"
version = "0.1.0"
description = "A Python bytecode decompilation tool, supporting versions 3.6 - 3.13"
authors = [ {name = "syssec-utd"} ]
keywords = ["python", "decompilation", "pylingual", "reversing", "decompiler", "bytecode"]
license = "GPL-3.0-only"
authors = [{ name = "syssec-utd" }]
readme = "README.md"
requires-python = ">=3.12.0"
requires-python = ">= 3.12"
license = "GPL-3.0-only"
keywords = [
"python",
"decompilation",
"pylingual",
"reversing",
"decompiler",
"bytecode",
]
dependencies = [
"asttokens",
"datasets",
@@ -28,7 +31,7 @@ dependencies = [
"transformers==4.46.1",
"transformers[torch]",
"xdis>=6.1.4",
"click"
"click",
]
[project.urls]
@@ -38,10 +41,9 @@ homepage = "https://pylingual.io"
pylingual = "pylingual.main:main"
[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"
# linter and formatter
[tool.ruff]
# Exclude commonly ignored directories.
exclude = [
@@ -55,7 +57,6 @@ exclude = [
".mypy_cache",
".nox",
".pants.d",
".pyenv",
".pytest_cache",
".pytype",
".ruff_cache",
@@ -94,3 +95,8 @@ skip-magic-trailing-comma = false
# to be opt-out in the future.
docstring-code-format = true
docstring-code-line-length = "dynamic"
[tool.uv.workspace]
members = [
"pylingual/tools",
]
Generated
+1664
View File
File diff suppressed because it is too large Load Diff