From 8daa5dfd1f127c3f225722e413fb787260908a66 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:03:27 +0000 Subject: [PATCH] Revert token limit validation - CodeT5 can handle sequences > 512 tokens Co-authored-by: jikk <862047+jikk@users.noreply.github.com> --- pylingual/models.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/pylingual/models.py b/pylingual/models.py index 766bb0f..e6cd1d6 100644 --- a/pylingual/models.py +++ b/pylingual/models.py @@ -25,11 +25,6 @@ else: logger = logging.getLogger(__name__) -class TokenLimitExceededError(Exception): """Raised when input exceeds the model's maximum token length.""" pass - - # translator with caching class CacheTranslator: """ @@ -49,26 +44,6 @@ class CacheTranslator: return self.cache[item] def _translate_and_decode(self, translation_requests: TrackedDataset | list[str], batch_size: int = 32, **kwargs) -> list[str]: - # Check for inputs that exceed the model's maximum input length - # T5 models typically have a max input length of 512 tokens - # Try multiple possible attribute names for max length across different model types - model_max_length = 512 # Default fallback - for attr in ['n_positions', 'max_position_embeddings']: - if hasattr(self.translator.model.config, attr): - model_max_length = getattr(self.translator.model.config, attr) - break - else: - # If config doesn't have the attributes, try tokenizer - if hasattr(self.translator.tokenizer, 'model_max_length'): - model_max_length = self.translator.tokenizer.model_max_length - - for i, request in enumerate(translation_requests): - tokenized = self.translator.tokenizer(request, return_tensors="pt", truncation=False) - input_length = tokenized['input_ids'].shape[1] - - if input_length > model_max_length: - raise TokenLimitExceededError(f"Input {i} exceeds model maximum length ({input_length} > {model_max_length} tokens). This bytecode statement is too long to decompile.") - # return_tensors=True prevents standard postprocessing which skips special tokens translation_result = self.translator(translation_requests, return_tensors=True, batch_size=batch_size, **kwargs) decoded_results = [] @@ -95,11 +70,8 @@ class CacheTranslator: for request in translation_requests: try: translation_results.append(self._translate_and_decode([request], batch_size=1)[0]) - except TokenLimitExceededError: - # Token limit error - provide specific message - translation_results.append("'''Decompiler error: statement exceeds model token limit. This bytecode is too long to decompile automatically.'''") except Exception: - # Other translation errors + # last resort fallback translation_results.append("'''Decompiler error: line too long for translation. Please decompile this statement manually.'''") return translation_results