fixes broken AWS rate limit backoff
This commit is contained in:
@@ -3,6 +3,7 @@ import { createProxyMiddleware } from "http-proxy-middleware";
|
||||
import { v4 } from "uuid";
|
||||
import { config } from "../config";
|
||||
import { logger } from "../logger";
|
||||
import { keyPool } from "../shared/key-management";
|
||||
import { createQueueMiddleware } from "./queue";
|
||||
import { ipLimiter } from "./rate-limit";
|
||||
import { handleProxyError } from "./middleware/common";
|
||||
@@ -134,6 +135,7 @@ const awsProxy = createQueueMiddleware(
|
||||
on: {
|
||||
proxyReq: createOnProxyReqHandler({
|
||||
pipeline: [
|
||||
(_, req) => keyPool.throttle(req.key!),
|
||||
applyQuotaLimits,
|
||||
// Credentials are added by signAwsRequest preprocessor
|
||||
languageFilter,
|
||||
|
||||
@@ -59,6 +59,7 @@ export const addKey: ProxyRequestMiddleware = (proxyReq, req) => {
|
||||
}
|
||||
}
|
||||
|
||||
keyPool.throttle(assignedKey);
|
||||
req.key = assignedKey;
|
||||
req.log.info(
|
||||
{
|
||||
|
||||
@@ -4,7 +4,7 @@ import { promisify } from "util";
|
||||
import {
|
||||
buildFakeSse,
|
||||
copySseResponseHeaders,
|
||||
initializeSseStream
|
||||
initializeSseStream,
|
||||
} from "../../../shared/streaming";
|
||||
import { enqueue } from "../../queue";
|
||||
import { decodeResponseBody, RawResponseBodyHandler, RetryableError } from ".";
|
||||
@@ -83,7 +83,7 @@ export const handleStreamedResponse: RawResponseBodyHandler = async (
|
||||
return aggregator.getFinalResponse();
|
||||
} catch (err) {
|
||||
if (err instanceof RetryableError) {
|
||||
keyPool.markRateLimited(req.key!)
|
||||
keyPool.markRateLimited(req.key!);
|
||||
req.log.warn(
|
||||
{ key: req.key!.hash, retryCount: req.retryCount },
|
||||
`Re-enqueueing request due to retryable error during streaming response.`
|
||||
|
||||
@@ -153,11 +153,6 @@ export class AnthropicKeyProvider implements KeyProvider<AnthropicKey> {
|
||||
|
||||
const selectedKey = keysByPriority[0];
|
||||
selectedKey.lastUsed = now;
|
||||
selectedKey.rateLimitedAt = now;
|
||||
// Intended to throttle the queue processor as otherwise it will just
|
||||
// flood the API with requests and we want to wait a sec to see if we're
|
||||
// going to get a rate limit error on this key.
|
||||
selectedKey.rateLimitedUntil = now + KEY_REUSE_DELAY;
|
||||
return { ...selectedKey };
|
||||
}
|
||||
|
||||
@@ -226,4 +221,11 @@ export class AnthropicKeyProvider implements KeyProvider<AnthropicKey> {
|
||||
});
|
||||
this.checker?.scheduleNextCheck();
|
||||
}
|
||||
|
||||
/**
 * Briefly marks the key identified by `hash` as rate limited.
 *
 * Stamps `rateLimitedAt` with the current time and sets `rateLimitedUntil`
 * to `KEY_REUSE_DELAY` milliseconds from now.
 *
 * @param hash Hash of the key to throttle.
 * @throws Error if this provider holds no key with the given hash.
 */
public throttle(hash: string) {
  const key = this.keys.find((k) => k.hash === hash);
  if (!key) {
    // `find` returns undefined for an unknown hash; fail with a clear message
    // instead of an opaque TypeError on the property assignment below.
    throw new Error(`Cannot throttle key ${hash}: no such key in this provider.`);
  }
  const now = Date.now();
  key.rateLimitedAt = now;
  key.rateLimitedUntil = now + KEY_REUSE_DELAY;
}
|
||||
}
|
||||
|
||||
@@ -37,13 +37,13 @@ export interface AwsBedrockKey extends Key, AwsBedrockKeyUsage {
|
||||
* Upon being rate limited, a key will be locked out for this many milliseconds
|
||||
* while we wait for other concurrent requests to finish.
|
||||
*/
|
||||
const RATE_LIMIT_LOCKOUT = 1000;
|
||||
const RATE_LIMIT_LOCKOUT = 4000;
|
||||
/**
|
||||
* Upon assigning a key, we will wait this many milliseconds before allowing it
|
||||
* to be used again. This is to prevent the queue from flooding a key with too
|
||||
* many requests while we wait to learn whether previous ones succeeded.
|
||||
*/
|
||||
const KEY_REUSE_DELAY = 500;
|
||||
const KEY_REUSE_DELAY = 250;
|
||||
|
||||
export class AwsBedrockKeyProvider implements KeyProvider<AwsBedrockKey> {
|
||||
readonly service = "aws";
|
||||
@@ -131,11 +131,6 @@ export class AwsBedrockKeyProvider implements KeyProvider<AwsBedrockKey> {
|
||||
|
||||
const selectedKey = keysByPriority[0];
|
||||
selectedKey.lastUsed = now;
|
||||
selectedKey.rateLimitedAt = now;
|
||||
// Intended to throttle the queue processor as otherwise it will just
|
||||
// flood the API with requests and we want to wait a sec to see if we're
|
||||
// going to get a rate limit error on this key.
|
||||
selectedKey.rateLimitedUntil = now + KEY_REUSE_DELAY;
|
||||
return { ...selectedKey };
|
||||
}
|
||||
|
||||
@@ -199,4 +194,11 @@ export class AwsBedrockKeyProvider implements KeyProvider<AwsBedrockKey> {
|
||||
this.update(hash, { lastChecked: 0, isDisabled: false })
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Briefly marks the key identified by `hash` as rate limited.
 *
 * Stamps `rateLimitedAt` with the current time and sets `rateLimitedUntil`
 * to `KEY_REUSE_DELAY` milliseconds from now.
 *
 * @param hash Hash of the key to throttle.
 * @throws Error if this provider holds no key with the given hash.
 */
public throttle(hash: string) {
  const key = this.keys.find((k) => k.hash === hash);
  if (!key) {
    // `find` returns undefined for an unknown hash; fail with a clear message
    // instead of an opaque TypeError on the property assignment below.
    throw new Error(`Cannot throttle key ${hash}: no such key in this provider.`);
  }
  const now = Date.now();
  key.rateLimitedAt = now;
  key.rateLimitedUntil = now + KEY_REUSE_DELAY;
}
|
||||
}
|
||||
|
||||
@@ -63,6 +63,7 @@ export interface KeyProvider<T extends Key = Key> {
|
||||
getLockoutPeriod(model: Model): number;
|
||||
markRateLimited(hash: string): void;
|
||||
recheck(): void;
|
||||
throttle(hash: string): void;
|
||||
}
|
||||
|
||||
export const keyPool = new KeyPool();
|
||||
@@ -80,4 +81,4 @@ export {
|
||||
export { AnthropicKey } from "./anthropic/provider";
|
||||
export { OpenAIKey } from "./openai/provider";
|
||||
export { GooglePalmKey } from "./palm/provider";
|
||||
export { AwsBedrockKey } from "./aws/provider";
|
||||
export { AwsBedrockKey } from "./aws/provider";
|
||||
|
||||
@@ -72,6 +72,11 @@ export class KeyPool {
|
||||
}, 0);
|
||||
}
|
||||
|
||||
/**
 * Forwards a throttle request to the provider that owns `key`.
 *
 * The provider is looked up from `key.service` and is asked to throttle the
 * key identified by `key.hash`.
 *
 * @param key Key to throttle.
 */
public throttle(key: Key) {
  this.getKeyProvider(key.service).throttle(key.hash);
}
|
||||
|
||||
public incrementUsage(key: Key, model: string, tokens: number): void {
|
||||
const provider = this.getKeyProvider(key.service);
|
||||
provider.incrementUsage(key.hash, model, tokens);
|
||||
|
||||
@@ -221,15 +221,6 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
|
||||
|
||||
const selectedKey = keysByPriority[0];
|
||||
selectedKey.lastUsed = now;
|
||||
|
||||
// When a key is selected, we rate-limit it for a brief period of time to
|
||||
// prevent the queue processor from immediately flooding it with requests
|
||||
// while the initial request is still being processed (which is when we will
|
||||
// get new rate limit headers).
|
||||
// Instead, we will let a request through every second until the key
|
||||
// becomes fully saturated and locked out again.
|
||||
selectedKey.rateLimitedAt = now;
|
||||
selectedKey.rateLimitRequestsReset = KEY_REUSE_DELAY;
|
||||
return { ...selectedKey };
|
||||
}
|
||||
|
||||
@@ -383,20 +374,16 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
|
||||
this.checker?.scheduleNextCheck();
|
||||
}
|
||||
|
||||
/** Writes key status to disk. */
|
||||
// public writeKeyStatus() {
|
||||
// const keys = this.keys.map((key) => ({
|
||||
// key: key.key,
|
||||
// isGpt4: key.isGpt4,
|
||||
// usage: key.usage,
|
||||
// hardLimit: key.hardLimit,
|
||||
// isDisabled: key.isDisabled,
|
||||
// }));
|
||||
// fs.writeFileSync(
|
||||
// path.join(__dirname, "..", "keys.json"),
|
||||
// JSON.stringify(keys, null, 2)
|
||||
// );
|
||||
// }
|
||||
/**
 * Called when a key is selected for a request, briefly disabling it to
 * avoid spamming the API with requests while we wait to learn whether this
 * key is already rate limited.
 *
 * Stamps `rateLimitedAt` with the current time and sets
 * `rateLimitRequestsReset` to `KEY_REUSE_DELAY`.
 *
 * @param hash Hash of the key to throttle.
 * @throws Error if this provider holds no key with the given hash.
 */
public throttle(hash: string) {
  const key = this.keys.find((k) => k.hash === hash);
  if (!key) {
    // `find` returns undefined for an unknown hash; fail with a clear message
    // instead of an opaque TypeError on the property assignment below.
    throw new Error(`Cannot throttle key ${hash}: no such key in this provider.`);
  }
  key.rateLimitedAt = Date.now();
  key.rateLimitRequestsReset = KEY_REUSE_DELAY;
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -122,11 +122,6 @@ export class GooglePalmKeyProvider implements KeyProvider<GooglePalmKey> {
|
||||
|
||||
const selectedKey = keysByPriority[0];
|
||||
selectedKey.lastUsed = now;
|
||||
selectedKey.rateLimitedAt = now;
|
||||
// Intended to throttle the queue processor as otherwise it will just
|
||||
// flood the API with requests and we want to wait a sec to see if we're
|
||||
// going to get a rate limit error on this key.
|
||||
selectedKey.rateLimitedUntil = now + KEY_REUSE_DELAY;
|
||||
return { ...selectedKey };
|
||||
}
|
||||
|
||||
@@ -186,4 +181,11 @@ export class GooglePalmKeyProvider implements KeyProvider<GooglePalmKey> {
|
||||
}
|
||||
|
||||
/** No-op: the body is empty, so calling this has no effect. */
public recheck() {}
|
||||
|
||||
/**
 * Briefly marks the key identified by `hash` as rate limited.
 *
 * Stamps `rateLimitedAt` with the current time and sets `rateLimitedUntil`
 * to `KEY_REUSE_DELAY` milliseconds from now.
 *
 * @param hash Hash of the key to throttle.
 * @throws Error if this provider holds no key with the given hash.
 */
public throttle(hash: string) {
  const key = this.keys.find((k) => k.hash === hash);
  if (!key) {
    // `find` returns undefined for an unknown hash; fail with a clear message
    // instead of an opaque TypeError on the property assignment below.
    throw new Error(`Cannot throttle key ${hash}: no such key in this provider.`);
  }
  const now = Date.now();
  key.rateLimitedAt = now;
  key.rateLimitedUntil = now + KEY_REUSE_DELAY;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user