mirror of
https://github.com/syssec-utd/pylingual.git
synced 2026-05-10 18:39:03 -07:00
Revert token limit validation - CodeT5 can handle sequences > 512 tokens
Co-authored-by: jikk <862047+jikk@users.noreply.github.com>
This commit is contained in:
+1
-29
@@ -25,11 +25,6 @@ else:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TokenLimitExceededError(Exception):
    """Raised when input exceeds the model's maximum token length."""
    # NOTE: the trailing `pass` was removed — a docstring is a sufficient
    # class body, so the extra statement was redundant (ruff PIE790).
|
||||
|
||||
|
||||
# translator with caching
|
||||
class CacheTranslator:
|
||||
"""
|
||||
@@ -49,26 +44,6 @@ class CacheTranslator:
|
||||
return self.cache[item]
|
||||
|
||||
def _translate_and_decode(self, translation_requests: TrackedDataset | list[str], batch_size: int = 32, **kwargs) -> list[str]:
|
||||
# Check for inputs that exceed the model's maximum input length
|
||||
# T5 models typically have a max input length of 512 tokens
|
||||
# Try multiple possible attribute names for max length across different model types
|
||||
model_max_length = 512 # Default fallback
|
||||
for attr in ['n_positions', 'max_position_embeddings']:
|
||||
if hasattr(self.translator.model.config, attr):
|
||||
model_max_length = getattr(self.translator.model.config, attr)
|
||||
break
|
||||
else:
|
||||
# If config doesn't have the attributes, try tokenizer
|
||||
if hasattr(self.translator.tokenizer, 'model_max_length'):
|
||||
model_max_length = self.translator.tokenizer.model_max_length
|
||||
|
||||
for i, request in enumerate(translation_requests):
|
||||
tokenized = self.translator.tokenizer(request, return_tensors="pt", truncation=False)
|
||||
input_length = tokenized['input_ids'].shape[1]
|
||||
|
||||
if input_length > model_max_length:
|
||||
raise TokenLimitExceededError(f"Input {i} exceeds model maximum length ({input_length} > {model_max_length} tokens). This bytecode statement is too long to decompile.")
|
||||
|
||||
# return_tensors=True prevents standard postprocessing which skips special tokens
|
||||
translation_result = self.translator(translation_requests, return_tensors=True, batch_size=batch_size, **kwargs)
|
||||
decoded_results = []
|
||||
@@ -95,11 +70,8 @@ class CacheTranslator:
|
||||
for request in translation_requests:
|
||||
try:
|
||||
translation_results.append(self._translate_and_decode([request], batch_size=1)[0])
|
||||
except TokenLimitExceededError:
|
||||
# Token limit error - provide specific message
|
||||
translation_results.append("'''Decompiler error: statement exceeds model token limit. This bytecode is too long to decompile automatically.'''")
|
||||
except Exception:
|
||||
# Other translation errors
|
||||
# last resort fallback
|
||||
translation_results.append("'''Decompiler error: line too long for translation. Please decompile this statement manually.'''")
|
||||
return translation_results
|
||||
|
||||
|
||||
Reference in New Issue
Block a user