Add tokenizers and configurable context size limits (khanon/oai-reverse-proxy!28)
@@ -45,6 +45,9 @@ export function writeErrorResponse(
     res.write(`data: [DONE]\n\n`);
     res.end();
   } else {
+    if (req.debug) {
+      errorPayload.error.proxy_tokenizer_debug_info = req.debug;
+    }
     res.status(statusCode).json(errorPayload);
   }
 }
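For context: when `req.debug` has been populated (the tokenizer preprocessor added below fills it), the error body now carries that object verbatim. A rough sketch of the resulting payload; the fields inside `proxy_tokenizer_debug_info` are illustrative, inferred from what this diff assigns (`token_count` from `countTokens`, plus the limit fields set in `validateContextSize`):

```ts
// Sketch of an error body with debug info attached (shape inferred from this
// diff; the exact fields depend on what countTokens returns).
const examplePayload = {
  error: {
    type: "proxy_internal_error",
    message: "...",
    proxy_tokenizer_debug_info: {
      token_count: 4123,      // from req.promptTokens = result.token_count
      prompt_tokens: 4123,    // set in validateContextSize
      max_model_tokens: 8192, // set in validateContextSize
      max_proxy_tokens: 16384, // illustrative configured proxy limit
    },
  },
};
```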
@@ -86,7 +89,7 @@ export const handleInternalError = (
   } else {
     writeErrorResponse(req, res, 500, {
       error: {
-        type: "proxy_rewriter_error",
+        type: "proxy_internal_error",
         proxy_note: `Reverse proxy encountered an error before it could reach the upstream API.`,
         message: err.message,
         stack: err.stack,
@@ -41,8 +41,6 @@ export const addKey: ProxyRequestMiddleware = (proxyReq, req) => {
   // For such cases, ignore the requested model entirely.
   if (req.inboundApi === "openai" && req.outboundApi === "anthropic") {
     req.log.debug("Using an Anthropic key for an OpenAI-compatible request");
-    // We don't assign the model here, that will happen when transforming the
-    // request body.
     assignedKey = keyPool.get("claude-v1");
   } else {
     assignedKey = keyPool.get(req.body.model);
@@ -0,0 +1,135 @@
+import { Request } from "express";
+import { z } from "zod";
+import { config } from "../../../config";
+import { countTokens } from "../../../tokenization";
+import { RequestPreprocessor } from ".";
+
+const CLAUDE_MAX_CONTEXT = config.maxContextTokensAnthropic;
+const OPENAI_MAX_CONTEXT = config.maxContextTokensOpenAI;
+
+/**
+ * Claude models don't throw an error if you exceed the token limit; they
+ * just become extremely slow and produce incoherent output. To be safe, we
+ * only allow 95% of the stated limit, which also accounts for our
+ * tokenization differing slightly from Anthropic's.
+ */
+const CLAUDE_TOKEN_LIMIT_ADJUSTMENT = 0.95;
+
+/**
+ * Assigns `req.promptTokens` and `req.outputTokens` based on the request body
+ * and outbound API format, which together determine the size of the context.
+ * If the context is too large, an error is thrown.
+ * This preprocessor should run after any preprocessor that transforms the
+ * request body.
+ */
+export const checkContextSize: RequestPreprocessor = async (req) => {
+  let prompt;
+
+  switch (req.outboundApi) {
+    case "openai":
+      req.outputTokens = req.body.max_tokens;
+      prompt = req.body.messages;
+      break;
+    case "anthropic":
+      req.outputTokens = req.body.max_tokens_to_sample;
+      prompt = req.body.prompt;
+      break;
+    default:
+      throw new Error(`Unknown outbound API: ${req.outboundApi}`);
+  }
+
+  const result = await countTokens({ req, prompt, service: req.outboundApi });
+  req.promptTokens = result.token_count;
+
+  // TODO: Remove once token counting is stable
+  req.log.debug({ result }, "Counted prompt tokens.");
+  req.debug = { ...(req.debug ?? {}), ...result };
+
+  maybeReassignModel(req);
+  validateContextSize(req);
+};
+
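The shape handed to `countTokens` differs per outbound API: OpenAI gets the `messages` array, Anthropic gets the flattened prompt string. A minimal sketch of the two calls as this preprocessor makes them; only `token_count` on the result is shown in this diff, so any other result fields are unspecified here:

```ts
// OpenAI-bound request: prompt is the chat messages array.
const openaiCount = await countTokens({
  req,
  prompt: [{ role: "user", content: "Hello" }],
  service: "openai",
});

// Anthropic-bound request: prompt is a single flattened string.
const claudeCount = await countTokens({
  req,
  prompt: "\n\nHuman: Hello\n\nAssistant:",
  service: "anthropic",
});

// Both results expose at least token_count:
req.promptTokens = openaiCount.token_count;
```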
+function validateContextSize(req: Request) {
+  assertRequestHasTokenCounts(req);
+  const promptTokens = req.promptTokens;
+  const outputTokens = req.outputTokens;
+  const contextTokens = promptTokens + outputTokens;
+  const model = req.body.model;
+
+  const proxyMax =
+    (req.outboundApi === "openai" ? OPENAI_MAX_CONTEXT : CLAUDE_MAX_CONTEXT) ||
+    Number.MAX_SAFE_INTEGER;
+  let modelMax = 0;
+
+  // Check the more specific `gpt-4-32k` before `gpt-4`; otherwise the 32k
+  // branch is unreachable.
+  if (model.match(/gpt-3\.5/)) {
+    modelMax = 4096;
+  } else if (model.match(/gpt-4-32k/)) {
+    modelMax = 32768;
+  } else if (model.match(/gpt-4/)) {
+    modelMax = 8192;
+  } else if (model.match(/claude-(?:instant-)?v1(?:\.\d)?-100k/)) {
+    modelMax = 100000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
+  } else if (model.match(/claude-(?:instant-)?v1(?:\.\d)?$/)) {
+    modelMax = 9000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
+  } else if (model.match(/claude-2/)) {
+    modelMax = 100000 * CLAUDE_TOKEN_LIMIT_ADJUSTMENT;
+  } else {
+    // Don't throw here; otherwise this list would need updating the moment a
+    // new model is released.
+    req.log.warn({ model }, "Unknown model, using 100k token limit.");
+    modelMax = 100000;
+  }
+
+  const finalMax = Math.min(proxyMax, modelMax);
+  z.number()
+    .int()
+    .max(finalMax, {
+      message: `Your request exceeds the context size limit for this model or proxy. (max: ${finalMax} tokens, requested: ${promptTokens} prompt + ${outputTokens} output = ${contextTokens} context tokens)`,
+    })
+    .parse(contextTokens);
+
+  req.log.debug(
+    { promptTokens, outputTokens, contextTokens, modelMax, proxyMax },
+    "Prompt size validated"
+  );
+
+  req.debug.prompt_tokens = promptTokens;
+  req.debug.max_model_tokens = modelMax;
+  req.debug.max_proxy_tokens = proxyMax;
+}
+
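A worked instance of the check above: a `gpt-4` request with 7,000 prompt tokens and 2,000 requested output tokens exceeds the 8,192-token model limit, so `.parse` throws a `ZodError` carrying the human-readable message (values illustrative, no proxy limit configured):

```ts
import { z } from "zod";

const modelMax = 8192; // gpt-4 branch above
const proxyMax = Number.MAX_SAFE_INTEGER; // no configured proxy limit
const finalMax = Math.min(proxyMax, modelMax); // 8192

const contextTokens = 7000 + 2000; // promptTokens + outputTokens

z.number()
  .int()
  .max(finalMax, {
    message: `max: ${finalMax} tokens, requested: ${contextTokens} context tokens`,
  })
  .parse(contextTokens); // throws ZodError: 9000 > 8192
```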
+function assertRequestHasTokenCounts(
+  req: Request
+): asserts req is Request & { promptTokens: number; outputTokens: number } {
+  z.object({
+    promptTokens: z.number().int().min(1),
+    outputTokens: z.number().int().min(1),
+  })
+    .nonstrict()
+    .parse(req);
+}
+
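Aside on the `asserts` return type used above: once the function returns without throwing, TypeScript narrows `req` so `promptTokens` and `outputTokens` are no longer optional. A generic sketch of the same pattern, not taken from this diff:

```ts
// Generic illustration of an assertion function.
function assertHasCount(obj: { count?: number }): asserts obj is { count: number } {
  if (typeof obj.count !== "number") throw new Error("count missing");
}

const item: { count?: number } = { count: 3 };
assertHasCount(item);
item.count.toFixed(); // narrowed: no `!` or optional chaining needed
```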
+/**
+ * For OpenAI-to-Anthropic requests, users can't specify the model, so we need
+ * to pick one based on the final context size. Ideally this would happen in
+ * the `transformOutboundPayload` preprocessor, but we don't have the context
+ * size at that point (and need a transformed body to calculate it).
+ */
+function maybeReassignModel(req: Request) {
+  if (req.inboundApi !== "openai" || req.outboundApi !== "anthropic") {
+    return;
+  }
+
+  const bigModel = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
+  const contextSize = req.promptTokens! + req.outputTokens!;
+
+  if (contextSize > 8500) {
+    req.log.debug(
+      { model: bigModel, contextSize },
+      "Using Claude 100k model for OpenAI-to-Anthropic request"
+    );
+    req.body.model = bigModel;
+  }
+  // Small model is the default, already set in `transformOutboundPayload`.
+}
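Worked example of the 8,500-token threshold (numbers illustrative): 8,000 prompt tokens plus 1,000 requested output tokens totals 9,000, which would blow past `claude-v1`'s adjusted limit of 9000 × 0.95 = 8,550, so the request is moved to the 100k model before `validateContextSize` runs:

```ts
const promptTokens = 8000;
const outputTokens = 1000;
const contextSize = promptTokens + outputTokens; // 9000 > 8500

// maybeReassignModel would set:
//   req.body.model = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
// which then validates against 100000 * 0.95 = 95000 instead of 8550.
```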
@@ -4,6 +4,7 @@ import type { ProxyReqCallback } from "http-proxy";

 // Express middleware (runs before http-proxy-middleware, can be async)
 export { createPreprocessorMiddleware } from "./preprocess";
+export { checkContextSize } from "./check-context-size";
 export { setApiFormat } from "./set-api-format";
 export { transformOutboundPayload } from "./transform-outbound-payload";

@@ -14,7 +15,6 @@ export { blockZoomerOrigins } from "./block-zoomer-origins";
 export { finalizeBody } from "./finalize-body";
 export { languageFilter } from "./language-filter";
 export { limitCompletions } from "./limit-completions";
-export { limitOutputTokens } from "./limit-output-tokens";
 export { removeOriginHeaders } from "./remove-origin-headers";
 export { transformKoboldPayload } from "./transform-kobold-payload";
@@ -1,46 +0,0 @@
-import { Request } from "express";
-import { config } from "../../../config";
-import { isCompletionRequest } from "../common";
-import { ProxyRequestMiddleware } from ".";
-
-/** Enforce a maximum number of tokens requested from the model. */
-export const limitOutputTokens: ProxyRequestMiddleware = (_proxyReq, req) => {
-  // TODO: do all of this shit in the zod validator
-  if (isCompletionRequest(req)) {
-    const requestedMax = Number.parseInt(getMaxTokensFromRequest(req));
-    const apiMax =
-      req.outboundApi === "openai"
-        ? config.maxOutputTokensOpenAI
-        : config.maxOutputTokensAnthropic;
-    let maxTokens = requestedMax;
-
-    if (typeof requestedMax !== "number") {
-      maxTokens = apiMax;
-    }
-
-    maxTokens = Math.min(maxTokens, apiMax);
-    if (req.outboundApi === "openai") {
-      req.body.max_tokens = maxTokens;
-    } else if (req.outboundApi === "anthropic") {
-      req.body.max_tokens_to_sample = maxTokens;
-    }
-
-    if (requestedMax !== maxTokens) {
-      req.log.info(
-        { requestedMax, configMax: apiMax, final: maxTokens },
-        "Limiting user's requested max output tokens"
-      );
-    }
-  }
-};
-
-function getMaxTokensFromRequest(req: Request) {
-  switch (req.outboundApi) {
-    case "anthropic":
-      return req.body?.max_tokens_to_sample;
-    case "openai":
-      return req.body?.max_tokens;
-    default:
-      throw new Error(`Unknown service: ${req.outboundApi}`);
-  }
-}
@@ -1,6 +1,11 @@
 import { RequestHandler } from "express";
 import { handleInternalError } from "../common";
-import { RequestPreprocessor, setApiFormat, transformOutboundPayload } from ".";
+import {
+  RequestPreprocessor,
+  checkContextSize,
+  setApiFormat,
+  transformOutboundPayload,
+} from ".";

 /**
  * Returns a middleware function that processes the request body into the given
@@ -13,6 +18,7 @@ export const createPreprocessorMiddleware = (
   const preprocessors: RequestPreprocessor[] = [
     setApiFormat(apiFormat),
     transformOutboundPayload,
+    checkContextSize,
     ...(additionalPreprocessors ?? []),
   ];
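A minimal sketch of the resulting pipeline (the route wiring around it is assumed, not shown in this diff): the preprocessors run in array order, so the body is fully transformed before `checkContextSize` counts tokens, exactly as its doc comment requires:

```ts
// Order matters: transformOutboundPayload must run before checkContextSize,
// since token counting needs the already-transformed body.
const preprocessors: RequestPreprocessor[] = [
  setApiFormat(apiFormat),  // 1. record inbound/outbound API formats
  transformOutboundPayload, // 2. validate and convert the request body
  checkContextSize,         // 3. count tokens, maybe swap model, enforce limits
];
```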
@@ -1,8 +1,12 @@
 import { Request } from "express";
 import { z } from "zod";
 import { config } from "../../../config";
+import { OpenAIPromptMessage } from "../../../tokenization";
 import { isCompletionRequest } from "../common";
 import { RequestPreprocessor } from ".";
-// import { countTokens } from "../../../tokenization";
+
+const CLAUDE_OUTPUT_MAX = config.maxOutputTokensAnthropic;
+const OPENAI_OUTPUT_MAX = config.maxOutputTokensOpenAI;

 // https://console.anthropic.com/docs/api/reference#-v1-complete
 const AnthropicV1CompleteSchema = z.object({
@@ -11,7 +15,10 @@ const AnthropicV1CompleteSchema = z.object({
     required_error:
       "No prompt found. Are you sending an OpenAI-formatted request to the Claude endpoint?",
   }),
-  max_tokens_to_sample: z.coerce.number(),
+  max_tokens_to_sample: z.coerce
+    .number()
+    .int()
+    .transform((v) => Math.min(v, CLAUDE_OUTPUT_MAX)),
   stop_sequences: z.array(z.string()).optional(),
   stream: z.boolean().optional().default(false),
   temperature: z.coerce.number().optional().default(1),
@@ -32,6 +39,8 @@ const OpenAIV1ChatCompletionSchema = z.object({
     {
       required_error:
         "No prompt found. Are you sending an Anthropic-formatted request to the OpenAI endpoint?",
+      invalid_type_error:
+        "Messages were not formatted correctly. Refer to the OpenAI Chat API documentation for more information.",
     }
   ),
   temperature: z.number().optional().default(1),
@@ -45,7 +54,12 @@ const OpenAIV1ChatCompletionSchema = z.object({
     .optional(),
   stream: z.boolean().optional().default(false),
   stop: z.union([z.string(), z.array(z.string())]).optional(),
-  max_tokens: z.coerce.number().optional(),
+  max_tokens: z.coerce
+    .number()
+    .int()
+    .optional()
+    .default(16)
+    .transform((v) => Math.min(v, OPENAI_OUTPUT_MAX)),
   frequency_penalty: z.number().optional().default(0),
   presence_penalty: z.number().optional().default(0),
   logit_bias: z.any().optional(),
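The coercing clamp in action; a self-contained sketch with `OPENAI_OUTPUT_MAX` assumed to be 300 for illustration:

```ts
import { z } from "zod";

const OPENAI_OUTPUT_MAX = 300; // stand-in for config.maxOutputTokensOpenAI

const maxTokens = z.coerce
  .number()
  .int()
  .optional()
  .default(16)
  .transform((v) => Math.min(v, OPENAI_OUTPUT_MAX));

maxTokens.parse(undefined); // -> 16 (default applies)
maxTokens.parse("1024");    // -> 300 (string coerced to number, then clamped)
maxTokens.parse(50);        // -> 50 (within the cap, passes through)
```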
@@ -63,7 +77,6 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
   }

   if (sameService) {
     // Just validate, don't transform.
     const validator =
       req.outboundApi === "openai"
         ? OpenAIV1ChatCompletionSchema
@@ -76,11 +89,12 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
       );
       throw result.error;
     }
+    req.body = result.data;
     return;
   }

   if (req.inboundApi === "openai" && req.outboundApi === "anthropic") {
-    req.body = openaiToAnthropic(req.body, req);
+    req.body = await openaiToAnthropic(req.body, req);
     return;
   }

@@ -89,7 +103,7 @@ export const transformOutboundPayload: RequestPreprocessor = async (req) => {
   );
 };

-function openaiToAnthropic(body: any, req: Request) {
+async function openaiToAnthropic(body: any, req: Request) {
   const result = OpenAIV1ChatCompletionSchema.safeParse(body);
   if (!result.success) {
     req.log.error(
@@ -107,37 +121,7 @@ async function openaiToAnthropic(body: any, req: Request) {
   req.headers["anthropic-version"] = "2023-01-01";

   const { messages, ...rest } = result.data;
-  const prompt =
-    result.data.messages
-      .map((m) => {
-        let role: string = m.role;
-        if (role === "assistant") {
-          role = "Assistant";
-        } else if (role === "system") {
-          role = "System";
-        } else if (role === "user") {
-          role = "Human";
-        }
-        // https://console.anthropic.com/docs/prompt-design
-        // `name` isn't supported by Anthropic but we can still try to use it.
-        return `\n\n${role}: ${m.name?.trim() ? `(as ${m.name}) ` : ""}${
-          m.content
-        }`;
-      })
-      .join("") + "\n\nAssistant: ";
-
-  // No longer defaulting to `claude-v1.2` because it seems to be in the process
-  // of being deprecated. `claude-v1` is the new default.
-  // If you have keys that can still use `claude-v1.2`, you can set the
-  // CLAUDE_BIG_MODEL and CLAUDE_SMALL_MODEL environment variables in your .env
-  // file.
-
-  const CLAUDE_BIG = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
-  const CLAUDE_SMALL = process.env.CLAUDE_SMALL_MODEL || "claude-v1";
-
-  // TODO: Finish implementing tokenizer for more accurate model selection.
-  // This currently uses _character count_, not token count.
-  const model = prompt.length > 25000 ? CLAUDE_BIG : CLAUDE_SMALL;
+  const prompt = openAIMessagesToClaudePrompt(messages);

   let stops = rest.stop
     ? Array.isArray(rest.stop)
@@ -154,9 +138,35 @@ async function openaiToAnthropic(body: any, req: Request) {

   return {
     ...rest,
-    model,
+    // Model may be overridden in `check-context-size.ts` to avoid a circular
+    // dependency (`check-context-size.ts` needs an already-transformed request
+    // body to count tokens, but this function would like to know the count to
+    // select a model).
+    model: process.env.CLAUDE_SMALL_MODEL || "claude-v1",
     prompt: prompt,
     max_tokens_to_sample: rest.max_tokens,
     stop_sequences: stops,
   };
 }
+
+export function openAIMessagesToClaudePrompt(messages: OpenAIPromptMessage[]) {
+  return (
+    messages
+      .map((m) => {
+        let role: string = m.role;
+        if (role === "assistant") {
+          role = "Assistant";
+        } else if (role === "system") {
+          role = "System";
+        } else if (role === "user") {
+          role = "Human";
+        }
+        // https://console.anthropic.com/docs/prompt-design
+        // `name` isn't supported by Anthropic but we can still try to use it.
+        return `\n\n${role}: ${m.name?.trim() ? `(as ${m.name}) ` : ""}${
+          m.content
+        }`;
+      })
+      .join("") + "\n\nAssistant:"
+  );
+}
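What the new flattener produces for a typical chat, derived directly from the code above (the `name` field on `OpenAIPromptMessage` is assumed optional, as its use here implies):

```ts
const prompt = openAIMessagesToClaudePrompt([
  { role: "system", content: "Be terse." },
  { role: "user", content: "Hi!", name: "alice" },
  { role: "assistant", content: "Hello." },
]);
// => "\n\nSystem: Be terse.\n\nHuman: (as alice) Hi!\n\nAssistant: Hello.\n\nAssistant:"
```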