Add tokenizers and configurable context size limits (khanon/oai-reverse-proxy!28)
@@ -45,6 +45,9 @@ export function writeErrorResponse(
     res.write(`data: [DONE]\n\n`);
     res.end();
   } else {
+    if (req.debug) {
+      errorPayload.error.proxy_tokenizer_debug_info = req.debug;
+    }
     res.status(statusCode).json(errorPayload);
   }
 }
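For context: when `req.debug` has been populated (the tokenizer preprocessor added below fills it), the error body now carries that object verbatim. A rough sketch of the resulting payload; the fields inside `proxy_tokenizer_debug_info` are illustrative, inferred from what this diff assigns (`token_count` from `countTokens`, plus the limit fields set in `validateContextSize`):

```ts
// Sketch of an error body with debug info attached (shape inferred from this
// diff; the exact fields depend on what countTokens returns).
const examplePayload = {
  error: {
    type: "proxy_internal_error",
    message: "...",
    proxy_tokenizer_debug_info: {
      token_count: 4123,      // from req.promptTokens = result.token_count
      prompt_tokens: 4123,    // set in validateContextSize
      max_model_tokens: 8192, // set in validateContextSize
      max_proxy_tokens: 16384, // illustrative configured proxy limit
    },
  },
};
```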
@@ -86,7 +89,7 @@ export const handleInternalError = (
   } else {
     writeErrorResponse(req, res, 500, {
       error: {
-        type: "proxy_rewriter_error",
+        type: "proxy_internal_error",
         proxy_note: `Reverse proxy encountered an error before it could reach the upstream API.`,
         message: err.message,
         stack: err.stack,
@@ -41,8 +41,6 @@ export const addKey: ProxyRequestMiddleware = (proxyReq, req) => {
   // For such cases, ignore the requested model entirely.
   if (req.inboundApi === "openai" && req.outboundApi === "anthropic") {
     req.log.debug("Using an Anthropic key for an OpenAI-compatible request");
-    // We don't assign the model here, that will happen when transforming the
-    // request body.
     assignedKey = keyPool.get("claude-v1");
   } else {
     assignedKey = keyPool.get(req.body.model);
@@ -0,0 +1,135 @@
+import { Request } from "express";
+import { z } from "zod";
+import { config } from "../../../config";
+import { countTokens } from "../../../tokenization";
+import { RequestPreprocessor } from ".";
+
+const CLAUDE_MAX_CONTEXT = config.maxContextTokensAnthropic;
+const OPENAI_MAX_CONTEXT = config.maxContextTokensOpenAI;
+
+/**
+ * Claude models don't throw an error if you exceed the token limit; they
+ * just become extremely slow and produce incoherent output. To be safe, we
+ * only allow 95% of the stated limit, which also accounts for our
+ * tokenization differing slightly from Anthropic's.
+ */
+const CLAUDE_TOKEN_LIMIT_ADJUSTMENT = 0.95;
+
+/**
+ * Assigns `req.promptTokens` and `req.outputTokens` based on the request body
+ * and outbound API format, which together determine the size of the context.
+ * If the context is too large, an error is thrown.
+ * This preprocessor should run after any preprocessor that transforms the
+ * request body.
+ */
+export const checkContextSize: RequestPreprocessor = async (req) => {
+  let prompt;
+
+  switch (req.outboundApi) {
+    case "openai":
+      req.outputTokens = req.body.max_tokens;
+      prompt = req.body.messages;
+      break;
+    case "anthropic":
+      req.outputTokens = req.body.max_tokens_to_sample;
+      prompt = req.body.prompt;
+      break;
+    default:
+      throw new Error(`Unknown outbound API: ${req.outboundApi}`);
+  }
+
+  const result = await countTokens({ req, prompt, service: req.outboundApi });
+  req.promptTokens = result.token_count;
+
+  // TODO: Remove once token counting is stable
+  req.log.debug({ result }, "Counted prompt tokens.");
+  req.debug = { ...(req.debug ?? {}), ...result };
+
+  maybeReassignModel(req);
+  validateContextSize(req);
+};
+
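The shape handed to `countTokens` differs per outbound API: OpenAI gets the `messages` array, Anthropic gets the flattened prompt string. A minimal sketch of the two calls as this preprocessor makes them; only `token_count` on the result is shown in this diff, so any other result fields are unspecified here:

```ts
// OpenAI-bound request: prompt is the chat messages array.
const openaiCount = await countTokens({
  req,
  prompt: [{ role: "user", content: "Hello" }],
  service: "openai",
});

// Anthropic-bound request: prompt is a single flattened string.
const claudeCount = await countTokens({
  req,
  prompt: "\n\nHuman: Hello\n\nAssistant:",
  service: "anthropic",
});

// Both results expose at least token_count:
req.promptTokens = openaiCount.token_count;
```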
+function validateContextSize(req: Request) {
+  assertRequestHasTokenCounts(req);
+  const promptTokens = req.promptTokens;
+  const outputTokens = req.outputTokens;
+  const contextTokens = promptTokens + outputTokens;
+  const model = req.body.model;
+
+  const proxyMax =
+    (req.outboundApi === "openai" ? OPENAI_MAX_CONTEXT : CLAUDE_MAX_CONTEXT) ||
+    Number.MAX_SAFE_INTEGER;
+  let modelMax = 0;
+
+  // Check the more specific `gpt-4-32k` before `gpt-4`; otherwise the 32k
+  // branch is unreachable.
+  if (model.match(/gpt-3\.5/)) {
+    modelMax = 4096;
+  } else if (model.match(/gpt-4-32k/)) {
+    modelMax = 32768;
+  } else if (model.match(/gpt-4/)) {
+    modelMax = 8192;
+  } else if (model.match(/claude-(?:instant-)?v1(?:\.\d)?-100k/)) {
+    modelMax = 100000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
+  } else if (model.match(/claude-(?:instant-)?v1(?:\.\d)?$/)) {
+    modelMax = 9000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
+  } else if (model.match(/claude-2/)) {
+    modelMax = 100000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
+  } else {
+    // Don't throw here; otherwise this list would need updating the moment a
+    // new model is released.
+    req.log.warn({ model }, "Unknown model, using 100k token limit.");
+    modelMax = 100000;
+  }
+
+  const finalMax = Math.min(proxyMax, modelMax);
+  z.number()
+    .int()
+    .max(finalMax, {
+      message: `Your request exceeds the context size limit for this model or proxy. (max: ${finalMax} tokens, requested: ${promptTokens} prompt + ${outputTokens} output = ${contextTokens} context tokens)`,
+    })
+    .parse(contextTokens);
+
+  req.log.debug(
+    { promptTokens, outputTokens, contextTokens, modelMax, proxyMax },
+    "Prompt size validated"
+  );
+
+  req.debug.prompt_tokens = promptTokens;
+  req.debug.max_model_tokens = modelMax;
+  req.debug.max_proxy_tokens = proxyMax;
+}
+
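A worked instance of the check above: a `gpt-4` request with 7,000 prompt tokens and 2,000 requested output tokens exceeds the 8,192-token model limit, so `.parse` throws a `ZodError` carrying the human-readable message (values illustrative, no proxy limit configured):

```ts
import { z } from "zod";

const modelMax = 8192; // gpt-4 branch above
const proxyMax = Number.MAX_SAFE_INTEGER; // no configured proxy limit
const finalMax = Math.min(proxyMax, modelMax); // 8192

const contextTokens = 7000 + 2000; // promptTokens + outputTokens

z.number()
  .int()
  .max(finalMax, {
    message: `max: ${finalMax} tokens, requested: ${contextTokens} context tokens`,
  })
  .parse(contextTokens); // throws ZodError: 9000 > 8192
```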
+function assertRequestHasTokenCounts(
+  req: Request
+): asserts req is Request & { promptTokens: number; outputTokens: number } {
+  z.object({
+    promptTokens: z.number().int().min(1),
+    outputTokens: z.number().int().min(1),
+  })
+    .nonstrict()
+    .parse(req);
+}
+
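Aside on the `asserts` return type used above: once the function returns without throwing, TypeScript narrows `req` so `promptTokens` and `outputTokens` are no longer optional. A generic sketch of the same pattern, not taken from this diff:

```ts
// Generic illustration of an assertion function.
function assertHasCount(obj: { count?: number }): asserts obj is { count: number } {
  if (typeof obj.count !== "number") throw new Error("count missing");
}

const item: { count?: number } = { count: 3 };
assertHasCount(item);
item.count.toFixed(); // narrowed: no `!` or optional chaining needed
```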
+/**
+ * For OpenAI-to-Anthropic requests, users can't specify the model, so we need
+ * to pick one based on the final context size. Ideally this would happen in
+ * the `transformOutboundPayload` preprocessor, but we don't have the context
+ * size at that point (and need a transformed body to calculate it).
+ */
+function maybeReassignModel(req: Request) {
+  if (req.inboundApi !== "openai" || req.outboundApi !== "anthropic") {
+    return;
+  }
+
+  const bigModel = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
+  const contextSize = req.promptTokens! + req.outputTokens!;
+
+  if (contextSize > 8500) {
+    req.log.debug(
+      { model: bigModel, contextSize },
+      "Using Claude 100k model for OpenAI-to-Anthropic request"
+    );
+    req.body.model = bigModel;
+  }
+  // Small model is the default, already set in `transformOutboundPayload`.
+}
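Worked example of the 8,500-token threshold (numbers illustrative): 8,000 prompt tokens plus 1,000 requested output tokens totals 9,000, which would blow past `claude-v1`'s adjusted limit of 9000 × 0.95 = 8,550, so the request is moved to the 100k model before `validateContextSize` runs:

```ts
const promptTokens = 8000;
const outputTokens = 1000;
const contextSize = promptTokens + outputTokens; // 9000 > 8500

// maybeReassignModel would set:
//   req.body.model = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
// which then validates against 100000 * 0.95 = 95000 instead of 8550.
```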
@@ -4,6 +4,7 @@ import type { ProxyReqCallback } from "http-proxy";

 // Express middleware (runs before http-proxy-middleware, can be async)
 export { createPreprocessorMiddleware } from "./preprocess";
+export { checkContextSize } from "./check-context-size";
 export { setApiFormat } from "./set-api-format";
 export { transformOutboundPayload } from "./transform-outbound-payload";

@@ -14,7 +15,6 @@ export { blockZoomerOrigins } from "./block-zoomer-origins";
 export { finalizeBody } from "./finalize-body";
 export { languageFilter } from "./language-filter";
 export { limitCompletions } from "./limit-completions";
-export { limitOutputTokens } from "./limit-output-tokens";
 export { removeOriginHeaders } from "./remove-origin-headers";
 export { transformKoboldPayload } from "./transform-kobold-payload";
@@ -1,46 +0,0 @@
-import { Request } from "express";
-import { config } from "../../../config";
-import { isCompletionRequest } from "../common";
-import { ProxyRequestMiddleware } from ".";
-
-/** Enforce a maximum number of tokens requested from the model. */
-export const limitOutputTokens: ProxyRequestMiddleware = (_proxyReq, req) => {
-  // TODO: do all of this shit in the zod validator
-  if (isCompletionRequest(req)) {
-    const requestedMax = Number.parseInt(getMaxTokensFromRequest(req));
-    const apiMax =
-      req.outboundApi === "openai"
-        ? config.maxOutputTokensOpenAI
-        : config.maxOutputTokensAnthropic;
-    let maxTokens = requestedMax;
-
-    if (typeof requestedMax !== "number") {
-      maxTokens = apiMax;
-    }
-
-    maxTokens = Math.min(maxTokens, apiMax);
-    if (req.outboundApi === "openai") {
-      req.body.max_tokens = maxTokens;
-    } else if (req.outboundApi === "anthropic") {
-      req.body.max_tokens_to_sample = maxTokens;
-    }
-
-    if (requestedMax !== maxTokens) {
-      req.log.info(
-        { requestedMax, configMax: apiMax, final: maxTokens },
-        "Limiting user's requested max output tokens"
-      );
-    }
-  }
-};
-
-function getMaxTokensFromRequest(req: Request) {
-  switch (req.outboundApi) {
-    case "anthropic":
-      return req.body?.max_tokens_to_sample;
-    case "openai":
-      return req.body?.max_tokens;
-    default:
-      throw new Error(`Unknown service: ${req.outboundApi}`);
-  }
-}
@@ -1,6 +1,11 @@
 import { RequestHandler } from "express";
 import { handleInternalError } from "../common";
-import { RequestPreprocessor, setApiFormat, transformOutboundPayload } from ".";
+import {
+  RequestPreprocessor,
+  checkContextSize,
+  setApiFormat,
+  transformOutboundPayload,
+} from ".";

 /**
  * Returns a middleware function that processes the request body into the given
@@ -13,6 +18,7 @@ export const createPreprocessorMiddleware = (
   const preprocessors: RequestPreprocessor[] = [
     setApiFormat(apiFormat),
     transformOutboundPayload,
+    checkContextSize,
     ...(additionalPreprocessors ?? []),
   ];
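A minimal sketch of the resulting pipeline (the route wiring around it is assumed, not shown in this diff): the preprocessors run in array order, so the body is fully transformed before `checkContextSize` counts tokens, exactly as its doc comment requires:

```ts
// Order matters: transformOutboundPayload must run before checkContextSize,
// since token counting needs the already-transformed body.
const preprocessors: RequestPreprocessor[] = [
  setApiFormat(apiFormat),  // 1. record inbound/outbound API formats
  transformOutboundPayload, // 2. validate and convert the request body
  checkContextSize,         // 3. count tokens, maybe swap model, enforce limits
];
```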
@@ -1,8 +1,12 @@
 import { Request } from "express";
 import { z } from "zod";
 import { config } from "../../../config";
+import { OpenAIPromptMessage } from "../../../tokenization";
 import { isCompletionRequest } from "../common";
 import { RequestPreprocessor } from ".";
-// import { countTokens } from "../../../tokenization";
+
+const CLAUDE_OUTPUT_MAX = config.maxOutputTokensAnthropic;
+const OPENAI_OUTPUT_MAX = config.maxOutputTokensOpenAI;

 // https://console.anthropic.com/docs/api/reference#-v1-complete
 const AnthropicV1CompleteSchema = z.object({
@@ -11,7 +15,10 @@ const AnthropicV1CompleteSchema = z.object({
     required_error:
       "No prompt found. Are you sending an OpenAI-formatted request to the Claude endpoint?",
   }),
-  max_tokens_to_sample: z.coerce.number(),
+  max_tokens_to_sample: z.coerce
+    .number()
+    .int()
+    .transform((v) => Math.min(v, CLAUDE_OUTPUT_MAX)),
   stop_sequences: z.array(z.string()).optional(),
   stream: z.boolean().optional().default(false),
   temperature: z.coerce.number().optional().default(1),
@@ -32,6 +39,8 @@ const OpenAIV1ChatCompletionSchema = z.object({
     {
       required_error:
         "No prompt found. Are you sending an Anthropic-formatted request to the OpenAI endpoint?",
+      invalid_type_error:
+        "Messages were not formatted correctly. Refer to the OpenAI Chat API documentation for more information.",
     }
   ),
   temperature: z.number().optional().default(1),
@@ -45,7 +54,12 @@ const OpenAIV1ChatCompletionSchema = z.object({
     .optional(),
   stream: z.boolean().optional().default(false),
   stop: z.union([z.string(), z.array(z.string())]).optional(),
-  max_tokens: z.coerce.number().optional(),
+  max_tokens: z.coerce
+    .number()
+    .int()
+    .optional()
+    .default(16)
+    .transform((v) => Math.min(v, OPENAI_OUTPUT_MAX)),
   frequency_penalty: z.number().optional().default(0),
   presence_penalty: z.number().optional().default(0),
   logit_bias: z.any().optional(),
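The coercing clamp in action; a self-contained sketch with `OPENAI_OUTPUT_MAX` assumed to be 300 for illustration:

```ts
import { z } from "zod";

const OPENAI_OUTPUT_MAX = 300; // stand-in for config.maxOutputTokensOpenAI

const maxTokens = z.coerce
  .number()
  .int()
  .optional()
  .default(16)
  .transform((v) => Math.min(v, OPENAI_OUTPUT_MAX));

maxTokens.parse(undefined); // -> 16 (default applies)
maxTokens.parse("1024");    // -> 300 (string coerced to number, then clamped)
maxTokens.parse(50);        // -> 50 (within the cap, passes through)
```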
@@ -63,7 +77,6 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
   }

   if (sameService) {
     // Just validate, don't transform.
     const validator =
       req.outboundApi === "openai"
         ? OpenAIV1ChatCompletionSchema
@@ -76,11 +89,12 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
       );
       throw result.error;
     }
+    req.body = result.data;
     return;
   }

   if (req.inboundApi === "openai" && req.outboundApi === "anthropic") {
-    req.body = openaiToAnthropic(req.body, req);
+    req.body = await openaiToAnthropic(req.body, req);
     return;
   }

@@ -89,7 +103,7 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
   );
 };

-function openaiToAnthropic(body: any, req: Request) {
+async function openaiToAnthropic(body: any, req: Request) {
   const result = OpenAIV1ChatCompletionSchema.safeParse(body);
   if (!result.success) {
     req.log.error(
@@ -107,37 +121,7 @@ async function openaiToAnthropic(body: any, req: Request) {
   req.headers["anthropic-version"] = "2023-01-01";

   const { messages, ...rest } = result.data;
-  const prompt =
-    result.data.messages
-      .map((m) => {
-        let role: string = m.role;
-        if (role === "assistant") {
-          role = "Assistant";
-        } else if (role === "system") {
-          role = "System";
-        } else if (role === "user") {
-          role = "Human";
-        }
-        // https://console.anthropic.com/docs/prompt-design
-        // `name` isn't supported by Anthropic but we can still try to use it.
-        return `\n\n${role}: ${m.name?.trim() ? `(as ${m.name}) ` : ""}${
-          m.content
-        }`;
-      })
-      .join("") + "\n\nAssistant: ";
-
-  // No longer defaulting to `claude-v1.2` because it seems to be in the process
-  // of being deprecated. `claude-v1` is the new default.
-  // If you have keys that can still use `claude-v1.2`, you can set the
-  // CLAUDE_BIG_MODEL and CLAUDE_SMALL_MODEL environment variables in your .env
-  // file.
-
-  const CLAUDE_BIG = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
-  const CLAUDE_SMALL = process.env.CLAUDE_SMALL_MODEL || "claude-v1";
-
-  // TODO: Finish implementing tokenizer for more accurate model selection.
-  // This currently uses _character count_, not token count.
-  const model = prompt.length > 25000 ? CLAUDE_BIG : CLAUDE_SMALL;
+  const prompt = openAIMessagesToClaudePrompt(messages);

   let stops = rest.stop
     ? Array.isArray(rest.stop)
@@ -154,9 +138,35 @@ async function openaiToAnthropic(body: any, req: Request) {

   return {
     ...rest,
-    model,
+    // Model may be overridden in `check-context-size.ts` to avoid a circular
+    // dependency (`check-context-size.ts` needs an already-transformed request
+    // body to count tokens, but this function would like to know the count to
+    // select a model).
+    model: process.env.CLAUDE_SMALL_MODEL || "claude-v1",
     prompt: prompt,
     max_tokens_to_sample: rest.max_tokens,
     stop_sequences: stops,
   };
 }
+
+export function openAIMessagesToClaudePrompt(messages: OpenAIPromptMessage[]) {
+  return (
+    messages
+      .map((m) => {
+        let role: string = m.role;
+        if (role === "assistant") {
+          role = "Assistant";
+        } else if (role === "system") {
+          role = "System";
+        } else if (role === "user") {
+          role = "Human";
+        }
+        // https://console.anthropic.com/docs/prompt-design
+        // `name` isn't supported by Anthropic but we can still try to use it.
+        return `\n\n${role}: ${m.name?.trim() ? `(as ${m.name}) ` : ""}${
+          m.content
+        }`;
+      })
+      .join("") + "\n\nAssistant:"
+  );
+}
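What the new flattener produces for a typical chat, derived directly from the code above (the `name` field on `OpenAIPromptMessage` is assumed optional, as its use here implies):

```ts
const prompt = openAIMessagesToClaudePrompt([
  { role: "system", content: "Be terse." },
  { role: "user", content: "Hi!", name: "alice" },
  { role: "assistant", content: "Hello." },
]);
// => "\n\nSystem: Be terse.\n\nHuman: (as alice) Hi!\n\nAssistant: Hello.\n\nAssistant:"
```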