mirror of
https://github.com/syssec-utd/pylingual.git
synced 2026-05-10 18:39:03 -07:00
Merge pull request #86 from syssec-utd/uv-migration
Migrate from Poetry to UV
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
dataset/
|
||||
venv/
|
||||
.venv/
|
||||
*.pyc
|
||||
.idea/
|
||||
.python-version
|
||||
|
||||
@@ -8,26 +8,21 @@ This codebase is optimized for readability and future extension, so there may in
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.12
|
||||
- `uv` Python package manager ([installation](https://docs.astral.sh/uv/getting-started/installation/)), used for project dependencies and managed Python versions.
|
||||
|
||||
### Compiling bytecode
|
||||
### Decompiling End-Of-Life Python Versions
|
||||
|
||||
Some parts of PyLingual require the ability to compile bytecode in a different Python version (equivalence check and model training). For this, you will need the following:
|
||||
To verify decompilation correctness and produce model training sets, PyLingual requires the ability to compile Python in the target version.
|
||||
|
||||
- [pyenv](https://github.com/pyenv/pyenv) with all Python versions you want to compile to
|
||||
- [pyenv-win](https://github.com/pyenv-win/pyenv-win) for Windows
|
||||
For current Python versions (3.8-3.13), PyLingual uses `uv`'s managed installations, but for Python 3.6 and 3.7, PyLingual uses [pyenv](https://github.com/pyenv/pyenv) ([pyenv-win](https://github.com/pyenv-win/pyenv-win) for Windows).
|
||||
|
||||
## Setup
|
||||
|
||||
Install from source, using [Poetry](https://python-poetry.org/):
|
||||
Install from source, using [uv](https://docs.astral.sh/uv/):
|
||||
|
||||
```sh
|
||||
git clone https://github.com/syssec-utd/pylingual
|
||||
cd pylingual
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install poetry>=2.0
|
||||
poetry install
|
||||
uv tool install ./pylingual
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -4,7 +4,7 @@ PyLingual's accuracy is dependent on having accurate segmentation and statement
|
||||
|
||||
## Dataset generation
|
||||
|
||||
First install [pyenv](https://github.com/pyenv/pyenv) and the required Python versions for the dataset. Create a dataset JSON file based off the sample (`sample_jsons/py36-sample-data.json`).
|
||||
Create a dataset JSON file based on the sample (`sample_jsons/py36-sample-data.json`).
|
||||
|
||||
The dataset directory should be structured like so, with only one `.py` file per directory:
|
||||
|
||||
@@ -24,7 +24,7 @@ dataset
|
||||
The names of the inner directories and files do not matter. Then create the dataset:
|
||||
|
||||
```
|
||||
python prepare_dataset.py <path to JSON>
|
||||
uv run prepare_dataset.py <path to JSON>
|
||||
```
|
||||
|
||||
## Segmentation model
|
||||
@@ -32,7 +32,7 @@ python prepare_dataset.py <path to JSON>
|
||||
Create a segmentation model JSON file based on the sample (`sample_jsons/py36-sample-segmentation.json`). Then train the model:
|
||||
|
||||
```
|
||||
python train_models.py --segmentation <path to JSON>
|
||||
uv run train_models.py --segmentation <path to JSON>
|
||||
```
|
||||
|
||||
## Statement model
|
||||
@@ -40,7 +40,7 @@ python train_models.py --segmentation <path to JSON>
|
||||
Create a statement model JSON file based on the sample (`sample_jsons/py36-sample-statement.json`). Then train the model:
|
||||
|
||||
```
|
||||
python train_models.py --statement <path to JSON>
|
||||
uv run train_models.py --statement <path to JSON>
|
||||
```
|
||||
|
||||
Once models are trained, update `../pylingual/decompiler_config.yaml` or create a separate config file by replacing the old models with the newly trained ones.
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../" }
|
||||
# ///
|
||||
|
||||
import contextlib
|
||||
import shutil
|
||||
import json
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import csv
|
||||
import itertools
|
||||
import logging
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import multiprocessing
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../" }
|
||||
# ///
|
||||
|
||||
import json
|
||||
import logging
|
||||
import pathlib
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import ast
|
||||
import functools
|
||||
import os
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../../" }
|
||||
# ///
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
# /// script
|
||||
# requires-python = ">= 3.12"
|
||||
# dependencies = [
|
||||
# "pylingual",
|
||||
# ]
|
||||
# [tool.uv.sources]
|
||||
# pylingual = { path = "../" }
|
||||
# ///
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
@@ -12,12 +21,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
|
||||
|
||||
# train tokenizer
|
||||
logger.info("training tokenizer...")
|
||||
subprocess.run(["python", segmentation_root / "train_tokenizer.py", segmentation_config_path])
|
||||
subprocess.run(["uv", "run", segmentation_root / "train_tokenizer.py", segmentation_config_path])
|
||||
|
||||
# train mlm (single gpu to avoid conflicts with local tokenized data)
|
||||
logger.info("training masked language model...")
|
||||
subprocess.run(
|
||||
[
|
||||
"uv", "run",
|
||||
"torchrun",
|
||||
f"--nnodes={nnodes}",
|
||||
f"--nproc-per-node={nproc_per_node}",
|
||||
@@ -31,12 +41,13 @@ def train_segmentation(segmentation_config_path: pathlib.Path, logger: logging.L
|
||||
|
||||
# tokenize dataset
|
||||
logger.info("tokenizing segmentation dataset...")
|
||||
subprocess.run(["python", segmentation_root / "tokenize_seg.py", segmentation_config_path])
|
||||
subprocess.run(["uv", "run", segmentation_root / "tokenize_seg.py", segmentation_config_path])
|
||||
|
||||
# train segmentation model (4 gpus)
|
||||
logger.info("training segmentation model...")
|
||||
subprocess.run(
|
||||
[
|
||||
"uv", "run",
|
||||
"torchrun",
|
||||
f"--nnodes={nnodes}",
|
||||
f"--nproc-per-node={nproc_per_node}",
|
||||
@@ -53,16 +64,17 @@ def train_statement(statement_config_path: pathlib.Path, logger: logging.Logger,
|
||||
statement_root = pathlib.Path(__file__).parent / "statement"
|
||||
|
||||
# manual tokenizer
|
||||
subprocess.run(["python", statement_root / "train_tokenizer_auto.py", statement_config_path])
|
||||
subprocess.run(["uv", "run", statement_root / "train_tokenizer_auto.py", statement_config_path])
|
||||
|
||||
# tokenize statement dataset with salesforce tokenizer
|
||||
logger.info("tokenizing statement dataset...")
|
||||
subprocess.run(["python", statement_root / "tokenize_seq2seq.py", statement_config_path])
|
||||
subprocess.run(["uv", "run", statement_root / "tokenize_seq2seq.py", statement_config_path])
|
||||
|
||||
# train statement model (4 gpus)
|
||||
logger.info("training statement model...")
|
||||
subprocess.run(
|
||||
[
|
||||
"uv", "run",
|
||||
"torchrun",
|
||||
f"--nnodes={nnodes}",
|
||||
f"--nproc-per-node={nproc_per_node}",
|
||||
|
||||
Generated
-3126
File diff suppressed because it is too large
Load Diff
@@ -41,7 +41,7 @@ from pylingual.control_flow_reconstruction.structure import bc_to_cft
|
||||
from pylingual.control_flow_reconstruction.cft import MetaTemplate
|
||||
from pylingual.equivalence_check import TestResult, compare_pyc
|
||||
from pylingual.models import CacheTranslator, load_models
|
||||
from pylingual.utils.generate_bytecode import CompileError, compile_version
|
||||
from pylingual.utils.generate_bytecode import CompileError, PyenvError, compile_version
|
||||
from pylingual.masking.model_disasm import create_global_masker, restore_masked_source_text
|
||||
from pylingual.editable_bytecode import PYCFile
|
||||
from pylingual.segmentation.segmentation_search_strategies import get_top_k_predictions, m_deep_top_k, naive_confidence_priority, filter_subwords
|
||||
@@ -122,10 +122,6 @@ class Decompiler:
|
||||
self.run_cflow_reconstruction()
|
||||
self.reconstruct_source()
|
||||
|
||||
if shutil.which("pyenv") is None and self.version != sys.version_info:
|
||||
logger.warning(f"pyenv is not installed so equivalence check cannot be performed. Please install pyenv manually along with the required Python version ({self.version}) or run PyLingual again with the --init-pyenv flag")
|
||||
return DecompilerResult(self.indented_source, [TestResult(False, "Cannot compare equivalence without pyenv installed", bc, bc) for bc in self.pyc.iter_bytecodes()], self.pyc, self.version)
|
||||
|
||||
self.equivalence_results = self.check_reconstruction(self.indented_source)
|
||||
self.correct_failures()
|
||||
|
||||
@@ -361,6 +357,9 @@ class Decompiler:
|
||||
compile_version(src, pyc, self.version)
|
||||
except CompileError as e:
|
||||
return [e]
|
||||
except PyenvError as e:
|
||||
logger.error(f"Could not check decompilation due to pyenv error: {e}")
|
||||
return []
|
||||
else:
|
||||
return compare_pyc(self.pyc, pyc)
|
||||
|
||||
|
||||
@@ -134,7 +134,6 @@ def main(files: list[str], out_dir: Path | None, config_file: Path | None, versi
|
||||
logger.exception(f"Failed to decompile {pyc_path}")
|
||||
console.rule()
|
||||
|
||||
|
||||
def install_pyenv():
|
||||
if shutil.which("pyenv") is not None:
|
||||
logger.warning("pyenv seems to already be installed, ignoring --init-pyenv...")
|
||||
@@ -165,6 +164,5 @@ def install_pyenv():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -5,11 +5,16 @@ import sys
|
||||
import py_compile
|
||||
import platform
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
||||
|
||||
from pylingual.utils.version import PythonVersion
|
||||
|
||||
|
||||
# Python 3.8 through 3.13 (range(8, 14) yields minors 8..13); compile_version
# routes targets in this set to _compile_uv instead of pyenv.
UV_VERSIONS = {PythonVersion((3, x)) for x in range(8, 14)}
|
||||
|
||||
|
||||
class CompileError(Exception):
|
||||
success = False
|
||||
bc_a = None
|
||||
@@ -19,18 +24,32 @@ class PyenvError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def compile_version(py_file, out_file, version):
|
||||
py_file = str(py_file)
|
||||
out_file = str(out_file)
|
||||
version = PythonVersion(version)
|
||||
if version == sys.version_info:
|
||||
try:
|
||||
py_compile.compile(py_file, cfile=out_file, doraise=True, optimize=0)
|
||||
except py_compile.PyCompileError as e:
|
||||
raise CompileError(str(e))
|
||||
return
|
||||
def _compile_native(py_file: str, out_file: str):
    """Compile ``py_file`` to ``out_file`` with the currently running interpreter.

    Args:
        py_file: Path of the Python source file to compile.
        out_file: Path the compiled bytecode file is written to.

    Raises:
        CompileError: If ``py_compile`` rejects the source. The original
            ``PyCompileError`` is kept as ``__cause__`` so the underlying
            traceback is not lost.
    """
    try:
        # doraise=True makes py_compile raise instead of printing the error to
        # stderr; optimize=0 requests unoptimized bytecode.
        py_compile.compile(py_file, cfile=out_file, doraise=True, optimize=0)
    except py_compile.PyCompileError as e:
        raise CompileError(str(e)) from e
|
||||
|
||||
|
||||
def _compile_uv(py_file: str, out_file: str, version: PythonVersion):
    """Compile ``py_file`` to ``out_file`` with a uv-managed interpreter.

    Runs ``uvx --python <version> python -c ...`` so the source is compiled by
    an interpreter of the requested version.

    Args:
        py_file: Path of the Python source file to compile.
        out_file: Path the compiled bytecode file is written to.
        version: Target Python version; its ``as_str()`` form selects the
            interpreter and its ``as_tuple()`` form is asserted inside it.

    Raises:
        CompileError: If the child process writes anything to stderr (other
            than uv's "Downloading ..." progress lines) or exits with a
            non-zero status.
    """
    # The snippet executes inside the target interpreter. The assert guards
    # against uv resolving a different minor version than requested.
    # py_compile.compile is called without doraise, so compile errors are
    # reported on the child's stderr rather than raised.
    compile_cmd = f"import py_compile, sys; assert sys.version_info[:2] == {version.as_tuple()!r}; py_compile.compile({py_file!r}, cfile={out_file!r})"

    cmd = ["uvx", "--python", version.as_str(), "python", "-c", compile_cmd]

    # PYTHONWARNINGS=ignore keeps interpreter warnings out of stderr, which is
    # treated as the failure signal below.
    output = subprocess.run(cmd, shell=False, capture_output=True, text=True, env={**os.environ, "PYTHONWARNINGS": "ignore"})

    # Ignore stderr messages from uv downloading versions on demand
    stderr = re.sub(r'Downloading .+\n', '', output.stderr)
    if stderr:
        raise CompileError(stderr)

    # A child that died without writing to stderr (e.g. killed by a signal)
    # must not be mistaken for a successful compilation.
    if output.returncode != 0:
        raise CompileError(f"uvx exited with status {output.returncode} while compiling {py_file} for Python {version.as_str()}")
|
||||
|
||||
|
||||
def _compile_pyenv(py_file: str, out_file: str, version: PythonVersion):
|
||||
which_pyenv = shutil.which("pyenv")
|
||||
if not which_pyenv:
|
||||
raise PyenvError(f"Could not find pyenv installation to compile in version {version.as_str()}. Try running with --init-pyenv to enable verification for end-of-life Python versions.")
|
||||
|
||||
version_win = None
|
||||
if platform.system() == "Windows": # workaround for pyenv-win being bugged when passing versions like 3.x not 3.x.y
|
||||
|
||||
@@ -62,3 +81,20 @@ def compile_version(py_file, out_file, version):
|
||||
|
||||
if output.stderr:
|
||||
raise CompileError(output.stderr)
|
||||
|
||||
|
||||
def compile_version(py_file, out_file, version):
    """Compile ``py_file`` into ``out_file`` as bytecode for ``version``.

    Both paths are converted to ``str`` and ``version`` is wrapped in
    ``PythonVersion`` before one of three backends is chosen:

    * the running interpreter, when ``version`` equals ``sys.version_info``;
    * a uv-managed interpreter, when ``version`` is in ``UV_VERSIONS``;
    * pyenv for every other version.

    Exceptions raised by the selected backend propagate unchanged.
    """
    source_path = str(py_file)
    target_path = str(out_file)
    target = PythonVersion(version)

    # Same interpreter version as the one running PyLingual: compile in-process.
    if target == sys.version_info:
        _compile_native(py_file=source_path, out_file=target_path)
        return

    # Versions that uv can provide as a managed installation.
    if target in UV_VERSIONS:
        _compile_uv(py_file=source_path, out_file=target_path, version=target)
        return

    # Everything else falls back to pyenv.
    _compile_pyenv(py_file=source_path, out_file=target_path, version=target)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
+19
-13
@@ -1,16 +1,19 @@
|
||||
[tool.poetry]
|
||||
requires-poetry = ">=2.0"
|
||||
|
||||
[project]
|
||||
name = "pylingual"
|
||||
version = "0.1.0"
|
||||
description = "A Python bytecode decompilation tool, supporting versions 3.6 - 3.13"
|
||||
authors = [ {name = "syssec-utd"} ]
|
||||
keywords = ["python", "decompilation", "pylingual", "reversing", "decompiler", "bytecode"]
|
||||
license = "GPL-3.0-only"
|
||||
|
||||
authors = [{ name = "syssec-utd" }]
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12.0"
|
||||
requires-python = ">= 3.12"
|
||||
license = "GPL-3.0-only"
|
||||
keywords = [
|
||||
"python",
|
||||
"decompilation",
|
||||
"pylingual",
|
||||
"reversing",
|
||||
"decompiler",
|
||||
"bytecode",
|
||||
]
|
||||
dependencies = [
|
||||
"asttokens",
|
||||
"datasets",
|
||||
@@ -28,7 +31,7 @@ dependencies = [
|
||||
"transformers==4.46.1",
|
||||
"transformers[torch]",
|
||||
"xdis>=6.1.4",
|
||||
"click"
|
||||
"click",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -38,10 +41,9 @@ homepage = "https://pylingual.io"
|
||||
pylingual = "pylingual.main:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
# linter and formatter
|
||||
[tool.ruff]
|
||||
# Exclude commonly ignored directories.
|
||||
exclude = [
|
||||
@@ -55,7 +57,6 @@ exclude = [
|
||||
".mypy_cache",
|
||||
".nox",
|
||||
".pants.d",
|
||||
".pyenv",
|
||||
".pytest_cache",
|
||||
".pytype",
|
||||
".ruff_cache",
|
||||
@@ -94,3 +95,8 @@ skip-magic-trailing-comma = false
|
||||
# to be opt-out in the future.
|
||||
docstring-code-format = true
|
||||
docstring-code-line-length = "dynamic"
|
||||
|
||||
[tool.uv.workspace]
|
||||
members = [
|
||||
"pylingual/tools",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user