Add tokenizers and configurable context size limits (khanon/oai-reverse-proxy!28)

This commit is contained in:
khanon
2023-07-22 00:11:32 +00:00
parent 7634afeea4
commit 56a4902599
23 changed files with 419 additions and 119 deletions
+4 -1
View File
@@ -45,6 +45,9 @@ export function writeErrorResponse(
res.write(`data: [DONE]\n\n`);
res.end();
} else {
if (req.debug) {
errorPayload.error.proxy_tokenizer_debug_info = req.debug;
}
res.status(statusCode).json(errorPayload);
}
}
@@ -86,7 +89,7 @@ export const handleInternalError = (
} else {
writeErrorResponse(req, res, 500, {
error: {
type: "proxy_rewriter_error",
type: "proxy_internal_error",
proxy_note: `Reverse proxy encountered an error before it could reach the upstream API.`,
message: err.message,
stack: err.stack,
-2
View File
@@ -41,8 +41,6 @@ export const addKey: ProxyRequestMiddleware = (proxyReq, req) => {
// For such cases, ignore the requested model entirely.
if (req.inboundApi === "openai" && req.outboundApi === "anthropic") {
req.log.debug("Using an Anthropic key for an OpenAI-compatible request");
// We don't assign the model here, that will happen when transforming the
// request body.
assignedKey = keyPool.get("claude-v1");
} else {
assignedKey = keyPool.get(req.body.model);
@@ -0,0 +1,135 @@
import { Request } from "express";
import { z } from "zod";
import { config } from "../../../config";
import { countTokens } from "../../../tokenization";
import { RequestPreprocessor } from ".";
const CLAUDE_MAX_CONTEXT = config.maxContextTokensAnthropic;
const OPENAI_MAX_CONTEXT = config.maxContextTokensOpenAI;
/**
* Claude models don't throw an error if you exceed the token limit and
* instead just become extremely slow and provide schizo output. To be safe,
* we will only allow 95% of the stated limit, which also accounts for our
* tokenization being slightly different than Anthropic's.
*/
const CLAUDE_TOKEN_LIMIT_ADJUSTMENT = 0.95;
/**
* Assigns `req.promptTokens` and `req.outputTokens` based on the request body
* and outbound API format, which combined determine the size of the context.
* If the context is too large, an error is thrown.
* This preprocessor should run after any preprocessor that transforms the
* request body.
*/
export const checkContextSize: RequestPreprocessor = async (req) => {
let prompt;
switch (req.outboundApi) {
case "openai":
req.outputTokens = req.body.max_tokens;
prompt = req.body.messages;
break;
case "anthropic":
req.outputTokens = req.body.max_tokens_to_sample;
prompt = req.body.prompt;
break;
default:
throw new Error(`Unknown outbound API: ${req.outboundApi}`);
}
const result = await countTokens({ req, prompt, service: req.outboundApi });
req.promptTokens = result.token_count;
// TODO: Remove once token counting is stable
req.log.debug({ result: result }, "Counted prompt tokens.");
req.debug = req.debug ?? {};
req.debug = { ...req.debug, ...result };
maybeReassignModel(req);
validateContextSize(req);
};
function validateContextSize(req: Request) {
assertRequestHasTokenCounts(req);
const promptTokens = req.promptTokens;
const outputTokens = req.outputTokens;
const contextTokens = promptTokens + outputTokens;
const model = req.body.model;
const proxyMax =
(req.outboundApi === "openai" ? OPENAI_MAX_CONTEXT : CLAUDE_MAX_CONTEXT) ||
Number.MAX_SAFE_INTEGER;
let modelMax = 0;
if (model.match(/gpt-3.5/)) {
modelMax = 4096;
} else if (model.match(/gpt-4/)) {
modelMax = 8192;
} else if (model.match(/gpt-4-32k/)) {
modelMax = 32768;
} else if (model.match(/claude-(?:instant-)?v1(?:\.\d)?(?:-100k)/)) {
modelMax = 100000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
} else if (model.match(/claude-(?:instant-)?v1(?:\.\d)?$/)) {
modelMax = 9000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
} else if (model.match(/claude-2/)) {
modelMax = 100000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
} else {
// Don't really want to throw here because I don't want to have to update
// this ASAP every time a new model is released.
req.log.warn({ model }, "Unknown model, using 100k token limit.");
modelMax = 100000;
}
const finalMax = Math.min(proxyMax, modelMax);
z.number()
.int()
.max(finalMax, {
message: `Your request exceeds the context size limit for this model or proxy. (max: ${finalMax} tokens, requested: ${promptTokens} prompt + ${outputTokens} output = ${contextTokens} context tokens)`,
})
.parse(contextTokens);
req.log.debug(
{ promptTokens, outputTokens, contextTokens, modelMax, proxyMax },
"Prompt size validated"
);
req.debug.prompt_tokens = promptTokens;
req.debug.max_model_tokens = modelMax;
req.debug.max_proxy_tokens = proxyMax;
}
function assertRequestHasTokenCounts(
req: Request
): asserts req is Request & { promptTokens: number; outputTokens: number } {
z.object({
promptTokens: z.number().int().min(1),
outputTokens: z.number().int().min(1),
})
.nonstrict()
.parse(req);
}
/**
* For OpenAI-to-Anthropic requests, users can't specify the model, so we need
* to pick one based on the final context size. Ideally this would happen in
* the `transformOutboundPayload` preprocessor, but we don't have the context
* size at that point (and need a transformed body to calculate it).
*/
function maybeReassignModel(req: Request) {
if (req.inboundApi !== "openai" || req.outboundApi !== "anthropic") {
return;
}
const bigModel = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
const contextSize = req.promptTokens! + req.outputTokens!;
if (contextSize > 8500) {
req.log.debug(
{ model: bigModel, contextSize },
"Using Claude 100k model for OpenAI-to-Anthropic request"
);
req.body.model = bigModel;
}
// Small model is the default already set in `transformOutboundPayload`
}
+1 -1
View File
@@ -4,6 +4,7 @@ import type { ProxyReqCallback } from "http-proxy";
// Express middleware (runs before http-proxy-middleware, can be async)
export { createPreprocessorMiddleware } from "./preprocess";
export { checkContextSize } from "./check-context-size";
export { setApiFormat } from "./set-api-format";
export { transformOutboundPayload } from "./transform-outbound-payload";
@@ -14,7 +15,6 @@ export { blockZoomerOrigins } from "./block-zoomer-origins";
export { finalizeBody } from "./finalize-body";
export { languageFilter } from "./language-filter";
export { limitCompletions } from "./limit-completions";
export { limitOutputTokens } from "./limit-output-tokens";
export { removeOriginHeaders } from "./remove-origin-headers";
export { transformKoboldPayload } from "./transform-kobold-payload";
@@ -1,46 +0,0 @@
import { Request } from "express";
import { config } from "../../../config";
import { isCompletionRequest } from "../common";
import { ProxyRequestMiddleware } from ".";
/** Enforce a maximum number of tokens requested from the model. */
export const limitOutputTokens: ProxyRequestMiddleware = (_proxyReq, req) => {
// TODO: do all of this shit in the zod validator
if (isCompletionRequest(req)) {
const requestedMax = Number.parseInt(getMaxTokensFromRequest(req));
const apiMax =
req.outboundApi === "openai"
? config.maxOutputTokensOpenAI
: config.maxOutputTokensAnthropic;
let maxTokens = requestedMax;
if (typeof requestedMax !== "number") {
maxTokens = apiMax;
}
maxTokens = Math.min(maxTokens, apiMax);
if (req.outboundApi === "openai") {
req.body.max_tokens = maxTokens;
} else if (req.outboundApi === "anthropic") {
req.body.max_tokens_to_sample = maxTokens;
}
if (requestedMax !== maxTokens) {
req.log.info(
{ requestedMax, configMax: apiMax, final: maxTokens },
"Limiting user's requested max output tokens"
);
}
}
};
function getMaxTokensFromRequest(req: Request) {
switch (req.outboundApi) {
case "anthropic":
return req.body?.max_tokens_to_sample;
case "openai":
return req.body?.max_tokens;
default:
throw new Error(`Unknown service: ${req.outboundApi}`);
}
}
+7 -1
View File
@@ -1,6 +1,11 @@
import { RequestHandler } from "express";
import { handleInternalError } from "../common";
import { RequestPreprocessor, setApiFormat, transformOutboundPayload } from ".";
import {
RequestPreprocessor,
checkContextSize,
setApiFormat,
transformOutboundPayload,
} from ".";
/**
* Returns a middleware function that processes the request body into the given
@@ -13,6 +18,7 @@ export const createPreprocessorMiddleware = (
const preprocessors: RequestPreprocessor[] = [
setApiFormat(apiFormat),
transformOutboundPayload,
checkContextSize,
...(additionalPreprocessors ?? []),
];
@@ -1,8 +1,12 @@
import { Request } from "express";
import { z } from "zod";
import { config } from "../../../config";
import { OpenAIPromptMessage } from "../../../tokenization";
import { isCompletionRequest } from "../common";
import { RequestPreprocessor } from ".";
// import { countTokens } from "../../../tokenization";
const CLAUDE_OUTPUT_MAX = config.maxOutputTokensAnthropic;
const OPENAI_OUTPUT_MAX = config.maxOutputTokensOpenAI;
// https://console.anthropic.com/docs/api/reference#-v1-complete
const AnthropicV1CompleteSchema = z.object({
@@ -11,7 +15,10 @@ const AnthropicV1CompleteSchema = z.object({
required_error:
"No prompt found. Are you sending an OpenAI-formatted request to the Claude endpoint?",
}),
max_tokens_to_sample: z.coerce.number(),
max_tokens_to_sample: z.coerce
.number()
.int()
.transform((v) => Math.min(v, CLAUDE_OUTPUT_MAX)),
stop_sequences: z.array(z.string()).optional(),
stream: z.boolean().optional().default(false),
temperature: z.coerce.number().optional().default(1),
@@ -32,6 +39,8 @@ const OpenAIV1ChatCompletionSchema = z.object({
{
required_error:
"No prompt found. Are you sending an Anthropic-formatted request to the OpenAI endpoint?",
invalid_type_error:
"Messages were not formatted correctly. Refer to the OpenAI Chat API documentation for more information.",
}
),
temperature: z.number().optional().default(1),
@@ -45,7 +54,12 @@ const OpenAIV1ChatCompletionSchema = z.object({
.optional(),
stream: z.boolean().optional().default(false),
stop: z.union([z.string(), z.array(z.string())]).optional(),
max_tokens: z.coerce.number().optional(),
max_tokens: z.coerce
.number()
.int()
.optional()
.default(16)
.transform((v) => Math.min(v, OPENAI_OUTPUT_MAX)),
frequency_penalty: z.number().optional().default(0),
presence_penalty: z.number().optional().default(0),
logit_bias: z.any().optional(),
@@ -63,7 +77,6 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
}
if (sameService) {
// Just validate, don't transform.
const validator =
req.outboundApi === "openai"
? OpenAIV1ChatCompletionSchema
@@ -76,11 +89,12 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
);
throw result.error;
}
req.body = result.data;
return;
}
if (req.inboundApi === "openai" && req.outboundApi === "anthropic") {
req.body = openaiToAnthropic(req.body, req);
req.body = await openaiToAnthropic(req.body, req);
return;
}
@@ -89,7 +103,7 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
);
};
function openaiToAnthropic(body: any, req: Request) {
async function openaiToAnthropic(body: any, req: Request) {
const result = OpenAIV1ChatCompletionSchema.safeParse(body);
if (!result.success) {
req.log.error(
@@ -107,37 +121,7 @@ function openaiToAnthropic(body: any, req: Request) {
req.headers["anthropic-version"] = "2023-01-01";
const { messages, ...rest } = result.data;
const prompt =
result.data.messages
.map((m) => {
let role: string = m.role;
if (role === "assistant") {
role = "Assistant";
} else if (role === "system") {
role = "System";
} else if (role === "user") {
role = "Human";
}
// https://console.anthropic.com/docs/prompt-design
// `name` isn't supported by Anthropic but we can still try to use it.
return `\n\n${role}: ${m.name?.trim() ? `(as ${m.name}) ` : ""}${
m.content
}`;
})
.join("") + "\n\nAssistant: ";
// No longer defaulting to `claude-v1.2` because it seems to be in the process
// of being deprecated. `claude-v1` is the new default.
// If you have keys that can still use `claude-v1.2`, you can set the
// CLAUDE_BIG_MODEL and CLAUDE_SMALL_MODEL environment variables in your .env
// file.
const CLAUDE_BIG = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
const CLAUDE_SMALL = process.env.CLAUDE_SMALL_MODEL || "claude-v1";
// TODO: Finish implementing tokenizer for more accurate model selection.
// This currently uses _character count_, not token count.
const model = prompt.length > 25000 ? CLAUDE_BIG : CLAUDE_SMALL;
const prompt = openAIMessagesToClaudePrompt(messages);
let stops = rest.stop
? Array.isArray(rest.stop)
@@ -154,9 +138,35 @@ function openaiToAnthropic(body: any, req: Request) {
return {
...rest,
model,
// Model may be overridden in `calculate-context-size.ts` to avoid having
// a circular dependency (`calculate-context-size.ts` needs an already-
// transformed request body to count tokens, but this function would like
// to know the count to select a model).
model: process.env.CLAUDE_SMALL_MODEL || "claude-v1",
prompt: prompt,
max_tokens_to_sample: rest.max_tokens,
stop_sequences: stops,
};
}
export function openAIMessagesToClaudePrompt(messages: OpenAIPromptMessage[]) {
return (
messages
.map((m) => {
let role: string = m.role;
if (role === "assistant") {
role = "Assistant";
} else if (role === "system") {
role = "System";
} else if (role === "user") {
role = "Human";
}
// https://console.anthropic.com/docs/prompt-design
// `name` isn't supported by Anthropic but we can still try to use it.
return `\n\n${role}: ${m.name?.trim() ? `(as ${m.name}) ` : ""}${
m.content
}`;
})
.join("") + "\n\nAssistant:"
);
}