mirror of
https://github.com/syssec-utd/pylingual.git
synced 2026-05-10 18:39:03 -07:00
Revert token limit validation - CodeT5 can handle sequences > 512 tokens
Co-authored-by: jikk <862047+jikk@users.noreply.github.com>
This commit is contained in:
+1
-29
@@ -25,11 +25,6 @@ else:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TokenLimitExceededError(Exception):
    """Raised when input exceeds the model's maximum token length."""
    # NOTE: the trailing `pass` was removed — a docstring is a sufficient
    # class body, so the extra statement was redundant (ruff PIE790).
|
||||
|
||||
|
||||
# translator with caching
|
||||
class CacheTranslator:
|
||||
"""
|
||||
@@ -49,26 +44,6 @@ class CacheTranslator:
|
||||
return self.cache[item]
|
||||
|
||||
def _translate_and_decode(self, translation_requests: TrackedDataset | list[str], batch_size: int = 32, **kwargs) -> list[str]:
|
||||
# Check for inputs that exceed the model's maximum input length
|
||||
# T5 models typically have a max input length of 512 tokens
|
||||
# Try multiple possible attribute names for max length across different model types
|
||||
model_max_length = 512 # Default fallback
|
||||
for attr in ['n_positions', 'max_position_embeddings']:
|
||||
if hasattr(self.translator.model.config, attr):
|
||||
model_max_length = getattr(self.translator.model.config, attr)
|
||||
break
|
||||
else:
|
||||
# If config doesn't have the attributes, try tokenizer
|
||||
if hasattr(self.translator.tokenizer, 'model_max_length'):
|
||||
model_max_length = self.translator.tokenizer.model_max_length
|
||||
|
||||
for i, request in enumerate(translation_requests):
|
||||
tokenized = self.translator.tokenizer(request, return_tensors="pt", truncation=False)
|
||||
input_length = tokenized['input_ids'].shape[1]
|
||||
|
||||
if input_length > model_max_length:
|
||||
raise TokenLimitExceededError(f"Input {i} exceeds model maximum length ({input_length} > {model_max_length} tokens). This bytecode statement is too long to decompile.")
|
||||
|
||||
# return_tensors=True prevents standard postprocessing which skips special tokens
|
||||
translation_result = self.translator(translation_requests, return_tensors=True, batch_size=batch_size, **kwargs)
|
||||
decoded_results = []
|
||||
@@ -95,11 +70,8 @@ class CacheTranslator:
|
||||
for request in translation_requests:
|
||||
try:
|
||||
translation_results.append(self._translate_and_decode([request], batch_size=1)[0])
|
||||
except TokenLimitExceededError:
|
||||
# Token limit error - provide specific message
|
||||
translation_results.append("'''Decompiler error: statement exceeds model token limit. This bytecode is too long to decompile automatically.'''")
|
||||
except Exception:
|
||||
# Other translation errors
|
||||
# last resort fallback
|
||||
translation_results.append("'''Decompiler error: line too long for translation. Please decompile this statement manually.'''")
|
||||
return translation_results
|
||||
|
||||
|
||||
Reference in New Issue
Block a user