From 8daa5dfd1f127c3f225722e413fb787260908a66 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:03:27 +0000 Subject: [PATCH] Revert token limit validation - CodeT5 can handle sequences > 512 tokens Co-authored-by: jikk <862047+jikk@users.noreply.github.com> --- pylingual/models.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/pylingual/models.py b/pylingual/models.py index 766bb0f..e6cd1d6 100644 --- a/pylingual/models.py +++ b/pylingual/models.py @@ -25,11 +25,6 @@ else: logger = logging.getLogger(__name__) -class TokenLimitExceededError(Exception): """Raised when input exceeds the model's maximum token length.""" pass - - # translator with caching class CacheTranslator: """ @@ -49,26 +44,6 @@ class CacheTranslator: return self.cache[item] def _translate_and_decode(self, translation_requests: TrackedDataset | list[str], batch_size: int = 32, **kwargs) -> list[str]: - # Check for inputs that exceed the model's maximum input length - # T5 models typically have a max input length of 512 tokens - # Try multiple possible attribute names for max length across different model types - model_max_length = 512 # Default fallback - for attr in ['n_positions', 'max_position_embeddings']: - if hasattr(self.translator.model.config, attr): - model_max_length = getattr(self.translator.model.config, attr) - break - else: - # If config doesn't have the attributes, try tokenizer - if hasattr(self.translator.tokenizer, 'model_max_length'): - model_max_length = self.translator.tokenizer.model_max_length - - for i, request in enumerate(translation_requests): - tokenized = self.translator.tokenizer(request, return_tensors="pt", truncation=False) - input_length = tokenized['input_ids'].shape[1] - - if input_length > model_max_length: - raise TokenLimitExceededError(f"Input {i} exceeds model maximum length ({input_length} > {model_max_length} tokens). This bytecode statement is too long to decompile.") - # return_tensors=True prevents standard postprocessing which skips special tokens translation_result = self.translator(translation_requests, return_tensors=True, batch_size=batch_size, **kwargs) decoded_results = [] @@ -95,11 +70,8 @@ class CacheTranslator: for request in translation_requests: try: translation_results.append(self._translate_and_decode([request], batch_size=1)[0]) - except TokenLimitExceededError: - # Token limit error - provide specific message - translation_results.append("'''Decompiler error: statement exceeds model token limit. This bytecode is too long to decompile automatically.'''") except Exception: - # Other translation errors + # last resort fallback translation_results.append("'''Decompiler error: line too long for translation. Please decompile this statement manually.'''") return translation_results