Per-user token quotas and automatic quota refreshing (khanon/oai-reverse-proxy!37)
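The excerpt below shows only the enforcement half of the change; the automatic refresh half lives in the user store and is not reproduced here. As a loose, hypothetical sketch of what periodic refreshing could look like (every name in this sketch is illustrative, not taken from this commit):

// Hypothetical sketch only; getUsers, refreshQuota, and the config key
// are assumed names, not code from this commit.
setInterval(() => {
  for (const user of getUsers()) {
    refreshQuota(user.token); // would reset the user's tokenCounts
  }
}, config.quotaRefreshPeriodMs);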
@@ -0,0 +1,30 @@
+import { hasAvailableQuota } from "../../auth/user-store";
+import { isCompletionRequest } from "../common";
+import { ProxyRequestMiddleware } from ".";
+
+export class QuotaExceededError extends Error {
+  public quotaInfo: any;
+  constructor(message: string, quotaInfo: any) {
+    super(message);
+    this.name = "QuotaExceededError";
+    this.quotaInfo = quotaInfo;
+  }
+}
+
+export const applyQuotaLimits: ProxyRequestMiddleware = (_proxyReq, req) => {
+  if (!isCompletionRequest(req) || !req.user) {
+    return;
+  }
+
+  const requestedTokens = (req.promptTokens ?? 0) + (req.outputTokens ?? 0);
+  if (!hasAvailableQuota(req.user.token, req.body.model, requestedTokens)) {
+    throw new QuotaExceededError(
+      "You have exceeded your proxy token quota for this model.",
+      {
+        quota: req.user.tokenLimits,
+        used: req.user.tokenCounts,
+        requested: requestedTokens,
+      }
+    );
+  }
+};
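hasAvailableQuota is defined in ../../auth/user-store and is not part of this excerpt. Judging from the call site and the error payload above (tokenLimits vs. tokenCounts), a minimal sketch of the check might look like the following; the getUser helper and the shape of the per-model maps are assumptions:

// Sketch inferred from the call site above, not the actual user-store code.
function hasAvailableQuota(
  token: string,
  model: string,
  requested: number
): boolean {
  const user = getUser(token); // assumed user-store lookup by token
  if (!user) return false;
  const limit = user.tokenLimits[model];
  if (!limit) return true; // no quota configured for this model
  const used = user.tokenCounts[model] ?? 0;
  return used + requested <= limit;
}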
@@ -1,7 +1,7 @@
 import { Request } from "express";
 import { z } from "zod";
 import { config } from "../../../config";
-import { countTokens } from "../../../tokenization";
+import { OpenAIPromptMessage, countTokens } from "../../../tokenization";
 import { RequestPreprocessor } from ".";
 
 const CLAUDE_MAX_CONTEXT = config.maxContextTokensAnthropic;
@@ -15,22 +15,26 @@ const OPENAI_MAX_CONTEXT = config.maxContextTokensOpenAI;
  * request body.
  */
 export const checkContextSize: RequestPreprocessor = async (req) => {
-  let prompt;
+  const service = req.outboundApi;
+  let result;
 
-  switch (req.outboundApi) {
-    case "openai":
+  switch (service) {
+    case "openai": {
       req.outputTokens = req.body.max_tokens;
-      prompt = req.body.messages;
+      const prompt: OpenAIPromptMessage[] = req.body.messages;
+      result = await countTokens({ req, prompt, service });
       break;
-    case "anthropic":
+    }
+    case "anthropic": {
       req.outputTokens = req.body.max_tokens_to_sample;
-      prompt = req.body.prompt;
+      const prompt: string = req.body.prompt;
+      result = await countTokens({ req, prompt, service });
       break;
+    }
     default:
       throw new Error(`Unknown outbound API: ${req.outboundApi}`);
   }
 
-  const result = await countTokens({ req, prompt, service: req.outboundApi });
   req.promptTokens = result.token_count;
 
   // TODO: Remove once token counting is stable
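The point of moving the countTokens call into each case is that the prompt type differs per outbound API: OpenAI takes an array of chat messages while Anthropic takes a single prompt string. A sketch of the pairing the switch now enforces (the real declaration lives in ../../../tokenization and may differ):

// Sketch of the presumed countTokens contract; not the actual declaration.
type CountTokensArgs =
  | { req: Request; prompt: OpenAIPromptMessage[]; service: "openai" }
  | { req: Request; prompt: string; service: "anthropic" };

declare function countTokens(
  args: CountTokensArgs
): Promise<{ token_count: number }>;

Previously, prompt was a single untyped let shared by both cases, so the compiler could not check that an OpenAI message array never reached the Anthropic tokenizer; a typed const scoped to each block restores that guarantee.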
@@ -89,6 +93,7 @@ function validateContextSize(req: Request)
   );
 
   req.debug.prompt_tokens = promptTokens;
   req.debug.completion_tokens = outputTokens;
+  req.debug.max_model_tokens = modelMax;
   req.debug.max_proxy_tokens = proxyMax;
 }
@@ -101,7 +106,7 @@ function assertRequestHasTokenCounts(
       outputTokens: z.number().int().min(1),
     })
     .nonstrict()
-    .parse(req);
+    .parse({ promptTokens: req.promptTokens, outputTokens: req.outputTokens });
 }
 
 /**
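The schema itself is unchanged; only the argument to .parse changes. Passing the entire Express Request worked only because .nonstrict() lets unknown keys through; parsing a two-field literal makes the intent explicit and avoids running zod over the whole Request object. Roughly:

// Same schema as above; .nonstrict() allows unknown keys to pass through.
const tokenCounts = z
  .object({
    promptTokens: z.number().int().min(1),
    outputTokens: z.number().int().min(1),
  })
  .nonstrict();

// Before: tokenCounts.parse(req) walked the entire Request object.
// After: only the two token counts are validated.
tokenCounts.parse({ promptTokens: req.promptTokens, outputTokens: req.outputTokens });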
@@ -3,6 +3,7 @@ import type { ClientRequest } from "http";
 import type { ProxyReqCallback } from "http-proxy";
 
 // Express middleware (runs before http-proxy-middleware, can be async)
+export { applyQuotaLimits } from "./apply-quota-limits";
 export { createPreprocessorMiddleware } from "./preprocess";
 export { checkContextSize } from "./check-context-size";
 export { setApiFormat } from "./set-api-format";
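For context, a ProxyRequestMiddleware such as applyQuotaLimits runs synchronously inside the proxy's request hook, unlike the Express-level preprocessors exported above. A rough sketch of the assumed wiring (the handler and its error handling are illustrative, not part of this diff):

// Assumed wiring: ProxyRequestMiddleware functions run as the outbound
// request is written; a thrown QuotaExceededError is expected to be caught
// upstream and converted into an HTTP error response.
import type { ClientRequest } from "http";
import type { Request } from "express";
import { applyQuotaLimits } from ".";

function onProxyReq(proxyReq: ClientRequest, req: Request): void {
  applyQuotaLimits(proxyReq, req); // throws QuotaExceededError when over quota
}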