Revert token limit validation - CodeT5 can handle sequences > 512 tokens

Co-authored-by: jikk <862047+jikk@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-01-07 17:03:27 +00:00
parent c8863bb4d3
commit 8daa5dfd1f
+1 -29
View File
@@ -25,11 +25,6 @@ else:
logger = logging.getLogger(__name__)
class TokenLimitExceededError(Exception):
    """Signals that a translation input is longer than the model's maximum token length.

    Callers catch this to substitute a specific "statement too long"
    fallback message instead of the generic translation-error fallback.
    """
# translator with caching
class CacheTranslator:
"""
@@ -49,26 +44,6 @@ class CacheTranslator:
return self.cache[item]
def _translate_and_decode(self, translation_requests: TrackedDataset | list[str], batch_size: int = 32, **kwargs) -> list[str]:
# Check for inputs that exceed the model's maximum input length
# T5 models typically have a max input length of 512 tokens
# Try multiple possible attribute names for max length across different model types
model_max_length = 512 # Default fallback
for attr in ['n_positions', 'max_position_embeddings']:
if hasattr(self.translator.model.config, attr):
model_max_length = getattr(self.translator.model.config, attr)
break
else:
# If config doesn't have the attributes, try tokenizer
if hasattr(self.translator.tokenizer, 'model_max_length'):
model_max_length = self.translator.tokenizer.model_max_length
for i, request in enumerate(translation_requests):
tokenized = self.translator.tokenizer(request, return_tensors="pt", truncation=False)
input_length = tokenized['input_ids'].shape[1]
if input_length > model_max_length:
raise TokenLimitExceededError(f"Input {i} exceeds model maximum length ({input_length} > {model_max_length} tokens). This bytecode statement is too long to decompile.")
# return_tensors=True prevents standard postprocessing which skips special tokens
translation_result = self.translator(translation_requests, return_tensors=True, batch_size=batch_size, **kwargs)
decoded_results = []
@@ -95,11 +70,8 @@ class CacheTranslator:
for request in translation_requests:
try:
translation_results.append(self._translate_and_decode([request], batch_size=1)[0])
except TokenLimitExceededError:
# Token limit error - provide specific message
translation_results.append("'''Decompiler error: statement exceeds model token limit. This bytecode is too long to decompile automatically.'''")
except Exception:
# Other translation errors
# last resort fallback
translation_results.append("'''Decompiler error: line too long for translation. Please decompile this statement manually.'''")
return translation_results