Implement AWS Bedrock support (khanon/oai-reverse-proxy!45)

add-key.ts
@@ -80,7 +80,6 @@ export const addKey: ProxyRequestMiddleware = (proxyReq, req) => {
       proxyReq.setHeader("X-API-Key", assignedKey.key);
       break;
     case "openai":
-    case "openai-text":
       const key: OpenAIKey = assignedKey as OpenAIKey;
       if (key.organizationId) {
         proxyReq.setHeader("OpenAI-Organization", key.organizationId);
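
The dropped case "openai-text" reflects a distinction this commit leans on: the switch above is over assignedKey.service, and "openai-text" is an API format, not a key service. A sketch of the two unions as this diff implies them (the real definitions live in shared/key-management and may differ):

    // Inferred from usage in this diff, not the verbatim source definitions.
    type LLMService = "openai" | "anthropic" | "google-palm" | "aws";
    type APIFormat = "openai" | "openai-text" | "anthropic" | "google-palm";
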
@@ -94,6 +93,10 @@ export const addKey: ProxyRequestMiddleware = (proxyReq, req) => {
         `?key=${assignedKey.key}`
       );
       break;
+    case "aws":
+      throw new Error(
+        "add-key should not be used for AWS security credentials. Use sign-aws-request instead."
+      );
     default:
      assertNever(assignedKey.service);
  }
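
The default: assertNever(assignedKey.service) arm buys compile-time exhaustiveness: if a new service is added to the union without a matching case, the argument no longer narrows to never and the build fails. A minimal sketch of such a helper (the project imports its own from shared/utils, which may differ):

    // Assumed shape of the exhaustiveness helper.
    export function assertNever(value: never): never {
      throw new Error(`Unexpected value: ${JSON.stringify(value)}`);
    }
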

count-prompt-tokens.ts (new file)
@@ -0,0 +1,48 @@
+import { RequestPreprocessor } from "./index";
+import { countTokens, OpenAIPromptMessage } from "../../../shared/tokenization";
+import { assertNever } from "../../../shared/utils";
+
+/**
+ * Given a request with an already-transformed body, counts the number of
+ * tokens and assigns the count to the request.
+ */
+export const countPromptTokens: RequestPreprocessor = async (req) => {
+  const service = req.outboundApi;
+  let result;
+
+  switch (service) {
+    case "openai": {
+      req.outputTokens = req.body.max_tokens;
+      const prompt: OpenAIPromptMessage[] = req.body.messages;
+      result = await countTokens({ req, prompt, service });
+      break;
+    }
+    case "openai-text": {
+      req.outputTokens = req.body.max_tokens;
+      const prompt: string = req.body.prompt;
+      result = await countTokens({ req, prompt, service });
+      break;
+    }
+    case "anthropic": {
+      req.outputTokens = req.body.max_tokens_to_sample;
+      const prompt: string = req.body.prompt;
+      result = await countTokens({ req, prompt, service });
+      break;
+    }
+    case "google-palm": {
+      req.outputTokens = req.body.maxOutputTokens;
+      const prompt: string = req.body.prompt.text;
+      result = await countTokens({ req, prompt, service });
+      break;
+    }
+    default:
+      assertNever(service);
+  }
+
+  req.promptTokens = result.token_count;
+
+  // TODO: Remove once token counting is stable
+  req.log.debug({ result: result }, "Counted prompt tokens.");
+  req.debug = req.debug ?? {};
+  req.debug = { ...req.debug, ...result };
+};
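
countPromptTokens reads result.token_count and spreads the remaining fields into req.debug, which implies a result shape roughly like the sketch below (inferred from usage here; the tokenization module's actual type may differ):

    // Assumed shape, inferred from `result.token_count` and the spread into req.debug.
    interface TokenCountResult {
      token_count: number;
      tokenizer?: string; // hypothetical extra field surfaced via req.debug
    }
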

finalize-aws-request.ts (new file)
@@ -0,0 +1,26 @@
+import type { ProxyRequestMiddleware } from ".";
+
+/**
+ * For AWS requests, the body is signed earlier in the request pipeline, before
+ * the proxy middleware. This function just assigns the path and headers to the
+ * proxy request.
+ */
+export const finalizeAwsRequest: ProxyRequestMiddleware = (proxyReq, req) => {
+  if (!req.signedRequest) {
+    throw new Error("Expected req.signedRequest to be set");
+  }
+
+  // The path depends on the selected model and the assigned key's region.
+  proxyReq.path = req.signedRequest.path;
+
+  // Amazon doesn't want extra headers, so we need to remove all of them and
+  // reassign only the ones specified in the signed request.
+  proxyReq.getRawHeaderNames().forEach(proxyReq.removeHeader.bind(proxyReq));
+  Object.entries(req.signedRequest.headers).forEach(([key, value]) => {
+    proxyReq.setHeader(key, value);
+  });
+
+  // Don't use fixRequestBody here because it adds a content-length header.
+  // Amazon doesn't want that and it breaks the signature.
+  proxyReq.write(req.signedRequest.body);
+};
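
The header scrubbing matters because SigV4 only validates if the request that reaches AWS matches what was signed: the signature covers a declared list of signed headers, and a proxy that appends or rewrites one of them (content-length is the usual offender) invalidates it. For illustration, a signed request carries headers along these lines (all values are fabricated placeholders):

    // Illustrative only; not a real signature or credential.
    const signedHeaders = {
      host: "invoke-bedrock.us-east-1.amazonaws.com",
      "content-type": "application/json",
      "x-amz-date": "20230101T000000Z",
      authorization:
        "AWS4-HMAC-SHA256 Credential=AKIA.../20230101/us-east-1/bedrock/aws4_request, " +
        "SignedHeaders=content-type;host;x-amz-date, Signature=...",
    };
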

index.ts
@@ -2,14 +2,17 @@ import type { Request } from "express";
 import type { ClientRequest } from "http";
 import type { ProxyReqCallback } from "http-proxy";

-// Express middleware (runs before http-proxy-middleware, can be async)
-export { applyQuotaLimits } from "./apply-quota-limits";
 export {
   createPreprocessorMiddleware,
   createEmbeddingsPreprocessorMiddleware,
 } from "./preprocess";
-export { checkContextSize } from "./check-context-size";

+// Express middleware (runs before http-proxy-middleware, can be async)
+export { applyQuotaLimits } from "./apply-quota-limits";
+export { validateContextSize } from "./validate-context-size";
+export { countPromptTokens } from "./count-prompt-tokens";
+export { setApiFormat } from "./set-api-format";
+export { signAwsRequest } from "./sign-aws-request";
+export { transformOutboundPayload } from "./transform-outbound-payload";

 // HPM middleware (runs on onProxyReq, cannot be async)
@@ -17,6 +20,7 @@ export { addKey, addKeyForEmbeddingsRequest } from "./add-key";
 export { addAnthropicPreamble } from "./add-anthropic-preamble";
 export { blockZoomerOrigins } from "./block-zoomer-origins";
 export { finalizeBody } from "./finalize-body";
+export { finalizeAwsRequest } from "./finalize-aws-request";
 export { languageFilter } from "./language-filter";
 export { limitCompletions } from "./limit-completions";
 export { stripHeaders } from "./strip-headers";
@@ -50,3 +54,6 @@ export type RequestPreprocessor = (req: Request) => void | Promise<void>;
  * request queue middleware.
  */
 export type ProxyRequestMiddleware = ProxyReqCallback<ClientRequest, Request>;
+
+export const forceModel = (model: string) => (req: Request) =>
+  void (req.body.model = model);
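
forceModel is a one-line RequestPreprocessor factory for routes that should always target a fixed model. A hypothetical use (the model ID is borrowed from sign-aws-request below; the wiring is an assumption, not part of this commit):

    // Hypothetical: pin every request on a route to Bedrock's Claude model.
    const pinClaude = forceModel("anthropic.claude-v2");
    // e.g. createPreprocessorMiddleware(format, { beforeTransform: [pinClaude] })
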

preprocess.ts
@@ -2,24 +2,42 @@ import { RequestHandler } from "express";
 import { handleInternalError } from "../common";
 import {
   RequestPreprocessor,
-  checkContextSize,
+  validateContextSize,
+  countPromptTokens,
   setApiFormat,
   transformOutboundPayload,
 } from ".";

+type RequestPreprocessorOptions = {
+  /**
+   * Functions to run before the request body is transformed between API
+   * formats. Use this to change the behavior of the transformation, such as for
+   * endpoints which can accept multiple API formats.
+   */
+  beforeTransform?: RequestPreprocessor[];
+  /**
+   * Functions to run after the request body is transformed and token counts are
+   * assigned. Use this to perform validation or other actions that depend on
+   * the request body being in the final API format.
+   */
+  afterTransform?: RequestPreprocessor[];
+};
+
 /**
  * Returns a middleware function that processes the request body into the given
  * API format, and then sequentially runs the given additional preprocessors.
  */
 export const createPreprocessorMiddleware = (
   apiFormat: Parameters<typeof setApiFormat>[0],
-  additionalPreprocessors?: RequestPreprocessor[]
+  { beforeTransform, afterTransform }: RequestPreprocessorOptions = {}
 ): RequestHandler => {
   const preprocessors: RequestPreprocessor[] = [
     setApiFormat(apiFormat),
-    ...(additionalPreprocessors ?? []),
+    ...(beforeTransform ?? []),
     transformOutboundPayload,
-    checkContextSize,
+    countPromptTokens,
+    ...(afterTransform ?? []),
+    validateContextSize,
   ];
   return async (...args) => executePreprocessors(preprocessors, args);
 };
@@ -29,13 +47,10 @@ export const createPreprocessorMiddleware = (
  * OpenAI's embeddings API. Tokens are not counted because embeddings requests
  * are basically free.
  */
-export const createEmbeddingsPreprocessorMiddleware = (
-  additionalPreprocessors?: RequestPreprocessor[]
-): RequestHandler => {
+export const createEmbeddingsPreprocessorMiddleware = (): RequestHandler => {
   const preprocessors: RequestPreprocessor[] = [
-    setApiFormat({ inApi: "openai", outApi: "openai" }),
+    setApiFormat({ inApi: "openai", outApi: "openai", service: "openai" }),
     (req) => void (req.promptTokens = req.outputTokens = 0),
-    ...(additionalPreprocessors ?? []),
   ];
   return async (...args) => executePreprocessors(preprocessors, args);
 };
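
The options object replaces the flat additionalPreprocessors list, so callers can hook in both before and after the body transform. Per the array above, the effective order is setApiFormat → beforeTransform → transformOutboundPayload → countPromptTokens → afterTransform → validateContextSize. A hypothetical call for an AWS route (signing after the body reaches its final format):

    // Hypothetical usage; the exact route wiring is not shown in this commit.
    const middleware = createPreprocessorMiddleware(
      { inApi: "anthropic", outApi: "anthropic", service: "aws" },
      { afterTransform: [signAwsRequest] }
    );
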

set-api-format.ts
@@ -1,13 +1,15 @@
 import { Request } from "express";
-import { APIFormat } from "../../../shared/key-management";
+import { APIFormat, LLMService } from "../../../shared/key-management";
 import { RequestPreprocessor } from ".";

 export const setApiFormat = (api: {
   inApi: Request["inboundApi"];
   outApi: APIFormat;
+  service: LLMService;
 }): RequestPreprocessor => {
   return (req) => {
     req.inboundApi = api.inApi;
     req.outboundApi = api.outApi;
+    req.service = api.service;
   };
 };
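
The new service field travels on the request separately from the API formats, so two routes can share an outbound wire format while drawing keys from different pools. Hypothetical calls illustrating the distinction:

    // Hypothetical: same Anthropic wire format, different key services.
    setApiFormat({ inApi: "anthropic", outApi: "anthropic", service: "anthropic" });
    setApiFormat({ inApi: "anthropic", outApi: "anthropic", service: "aws" });
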

sign-aws-request.ts (new file)
@@ -0,0 +1,93 @@
+import express from "express";
+import { Sha256 } from "@aws-crypto/sha256-js";
+import { SignatureV4 } from "@smithy/signature-v4";
+import { HttpRequest } from "@smithy/protocol-http";
+import { keyPool } from "../../../shared/key-management";
+import { RequestPreprocessor } from ".";
+import { AnthropicV1CompleteSchema } from "./transform-outbound-payload";
+
+const AMZ_HOST =
+  process.env.AMZ_HOST || "invoke-bedrock.%REGION%.amazonaws.com";
+
+/**
+ * Signs an outgoing AWS request with the appropriate headers and modifies the
+ * request object in place to fix the path.
+ */
+export const signAwsRequest: RequestPreprocessor = async (req) => {
+  req.key = keyPool.get("anthropic.claude-v2");
+
+  const { model, stream } = req.body;
+  req.isStreaming = stream === true || stream === "true";
+
+  let preamble = req.body.prompt.startsWith("\n\nHuman:") ? "" : "\n\nHuman:";
+  req.body.prompt = preamble + req.body.prompt;
+
+  // AWS supports only a subset of Anthropic's parameters and is more strict
+  // about unknown parameters.
+  // TODO: This should happen in transform-outbound-payload.ts
+  const strippedParams = AnthropicV1CompleteSchema.pick({
+    prompt: true,
+    max_tokens_to_sample: true,
+    stop_sequences: true,
+    temperature: true,
+    top_k: true,
+    top_p: true,
+  }).parse(req.body);
+
+  const credential = getCredentialParts(req);
+  const host = AMZ_HOST.replace("%REGION%", credential.region);
+
+  // Uses the AWS SDK to sign a request, then modifies our HPM proxy request
+  // with the headers generated by the SDK.
+  const newRequest = new HttpRequest({
+    method: "POST",
+    protocol: "https:",
+    hostname: host,
+    path: `/model/${model}/invoke${stream ? "-with-response-stream" : ""}`,
+    headers: {
+      ["Host"]: host,
+      ["content-type"]: "application/json",
+    },
+    body: JSON.stringify(strippedParams),
+  });
+
+  if (stream) {
+    newRequest.headers["x-amzn-bedrock-accept"] = "application/json";
+  } else {
+    newRequest.headers["accept"] = "*/*";
+  }
+
+  req.signedRequest = await sign(newRequest, getCredentialParts(req));
+};
+
+type Credential = {
+  accessKeyId: string;
+  secretAccessKey: string;
+  region: string;
+};
+function getCredentialParts(req: express.Request): Credential {
+  const [accessKeyId, secretAccessKey, region] = req.key!.key.split(":");
+
+  if (!accessKeyId || !secretAccessKey || !region) {
+    req.log.error(
+      { key: req.key!.hash },
+      "AWS_CREDENTIALS isn't correctly formatted; refer to the docs"
+    );
+    throw new Error("The key assigned to this request is invalid.");
+  }
+
+  return { accessKeyId, secretAccessKey, region };
+}
+
+async function sign(request: HttpRequest, credential: Credential) {
+  const { accessKeyId, secretAccessKey, region } = credential;
+
+  const signer = new SignatureV4({
+    sha256: Sha256,
+    credentials: { accessKeyId, secretAccessKey },
+    region,
+    service: "bedrock",
+  });
+
+  return signer.sign(request);
+}
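
getCredentialParts implies AWS keys are configured as a single colon-delimited string. A sketch of the expected shape, using AWS's documentation example credentials rather than real ones:

    // accessKeyId:secretAccessKey:region (placeholder values only)
    const raw = "AKIAIOSFODNN7EXAMPLE:wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY:us-east-1";
    const [accessKeyId, secretAccessKey, region] = raw.split(":");
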

transform-outbound-payload.ts
@@ -10,8 +10,8 @@ const CLAUDE_OUTPUT_MAX = config.maxOutputTokensAnthropic;
 const OPENAI_OUTPUT_MAX = config.maxOutputTokensOpenAI;

 // https://console.anthropic.com/docs/api/reference#-v1-complete
-const AnthropicV1CompleteSchema = z.object({
-  model: z.string().regex(/^claude-/, "Model must start with 'claude-'"),
+export const AnthropicV1CompleteSchema = z.object({
+  model: z.string(),
   prompt: z.string({
     required_error:
       "No prompt found. Are you sending an OpenAI-formatted request to the Claude endpoint?",
@@ -23,14 +23,14 @@ const AnthropicV1CompleteSchema = z.object({
   stop_sequences: z.array(z.string()).optional(),
   stream: z.boolean().optional().default(false),
   temperature: z.coerce.number().optional().default(1),
-  top_k: z.coerce.number().optional().default(-1),
-  top_p: z.coerce.number().optional().default(-1),
+  top_k: z.coerce.number().optional(),
+  top_p: z.coerce.number().optional(),
   metadata: z.any().optional(),
 });

 // https://platform.openai.com/docs/api-reference/chat/create
 const OpenAIV1ChatCompletionSchema = z.object({
-  model: z.string().regex(/^gpt/, "Model must start with 'gpt-'"),
+  model: z.string(),
   messages: z.array(
     z.object({
       role: z.enum(["system", "user", "assistant"]),
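
Dropping .default(-1) on top_k/top_p is more than cosmetic: with a default, every parsed body carries those keys even when the client omitted them, and per the comment in sign-aws-request, AWS is stricter about parameters than Anthropic. With plain .optional(), omitted keys stay omitted. A standalone zod illustration:

    import { z } from "zod";

    const withDefault = z.object({ top_k: z.coerce.number().optional().default(-1) });
    const optionalOnly = z.object({ top_k: z.coerce.number().optional() });

    withDefault.parse({});  // => { top_k: -1 }  (key always present)
    optionalOnly.parse({}); // => {}             (key omitted entirely)
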
@@ -89,7 +89,7 @@ const OpenAIV1TextCompletionSchema = z

 // https://developers.generativeai.google/api/rest/generativelanguage/models/generateText
 const PalmV1GenerateTextSchema = z.object({
-  model: z.string().regex(/^\w+-bison-\d{3}$/),
+  model: z.string(),
   prompt: z.object({ text: z.string() }),
   temperature: z.number().optional(),
   maxOutputTokens: z.coerce
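
Relaxing model to a plain z.string() is what lets Bedrock-style model IDs through: the old pattern would have rejected the very ID this commit requests from the key pool. A quick illustration:

    import { z } from "zod";

    // "anthropic.claude-v2" is the model ID used in sign-aws-request above.
    z.string().regex(/^claude-/).safeParse("anthropic.claude-v2").success; // false
    z.string().safeParse("anthropic.claude-v2").success;                   // true
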
@@ -159,7 +159,7 @@ function openaiToAnthropic(req: Request) {
   const { body } = req;
   const result = OpenAIV1ChatCompletionSchema.safeParse(body);
   if (!result.success) {
-    req.log.error(
+    req.log.warn(
       { issues: result.error.issues, body },
       "Invalid OpenAI-to-Anthropic request"
     );
@@ -208,7 +208,7 @@ function openaiToOpenaiText(req: Request) {
   const { body } = req;
   const result = OpenAIV1ChatCompletionSchema.safeParse(body);
   if (!result.success) {
-    req.log.error(
+    req.log.warn(
       { issues: result.error.issues, body },
       "Invalid OpenAI-to-OpenAI-text request"
     );
@@ -227,8 +227,7 @@ function openaiToOpenaiText(req: Request) {
   stops = [...new Set(stops)];

   const transformed = { ...rest, prompt: prompt, stop: stops };
-  const validated = OpenAIV1TextCompletionSchema.parse(transformed);
-  return validated;
+  return OpenAIV1TextCompletionSchema.parse(transformed);
 }

 function openaiToPalm(req: Request): z.infer<typeof PalmV1GenerateTextSchema> {
@@ -238,7 +237,7 @@ function openaiToPalm(req: Request): z.infer<typeof PalmV1GenerateTextSchema> {
     model: "gpt-3.5-turbo",
   });
   if (!result.success) {
-    req.log.error(
+    req.log.warn(
       { issues: result.error.issues, body },
       "Invalid OpenAI-to-Palm request"
     );

check-context-size.ts → validate-context-size.ts (+3 −72)
@@ -1,9 +1,8 @@
 import { Request } from "express";
 import { z } from "zod";
 import { config } from "../../../config";
-import { OpenAIPromptMessage, countTokens } from "../../../shared/tokenization";
-import { RequestPreprocessor } from ".";
 import { assertNever } from "../../../shared/utils";
+import { RequestPreprocessor } from ".";

 const CLAUDE_MAX_CONTEXT = config.maxContextTokensAnthropic;
 const OPENAI_MAX_CONTEXT = config.maxContextTokensOpenAI;
@@ -16,51 +15,7 @@ const BISON_MAX_CONTEXT = 8100;
  * This preprocessor should run after any preprocessor that transforms the
  * request body.
  */
-export const checkContextSize: RequestPreprocessor = async (req) => {
-  const service = req.outboundApi;
-  let result;
-
-  switch (service) {
-    case "openai": {
-      req.outputTokens = req.body.max_tokens;
-      const prompt: OpenAIPromptMessage[] = req.body.messages;
-      result = await countTokens({ req, prompt, service });
-      break;
-    }
-    case "openai-text": {
-      req.outputTokens = req.body.max_tokens;
-      const prompt: string = req.body.prompt;
-      result = await countTokens({ req, prompt, service });
-      break;
-    }
-    case "anthropic": {
-      req.outputTokens = req.body.max_tokens_to_sample;
-      const prompt: string = req.body.prompt;
-      result = await countTokens({ req, prompt, service });
-      break;
-    }
-    case "google-palm": {
-      req.outputTokens = req.body.maxOutputTokens;
-      const prompt: string = req.body.prompt.text;
-      result = await countTokens({ req, prompt, service });
-      break;
-    }
-    default:
-      assertNever(service);
-  }
-
-  req.promptTokens = result.token_count;
-
-  // TODO: Remove once token counting is stable
-  req.log.debug({ result: result }, "Counted prompt tokens.");
-  req.debug = req.debug ?? {};
-  req.debug = { ...req.debug, ...result };
-
-  maybeTranslateOpenAIModel(req);
-  validateContextSize(req);
-};
-
-function validateContextSize(req: Request) {
+export const validateContextSize: RequestPreprocessor = async (req) => {
   assertRequestHasTokenCounts(req);
   const promptTokens = req.promptTokens;
   const outputTokens = req.outputTokens;
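
For orientation: the switch deleted here is the same one added verbatim in count-prompt-tokens.ts earlier in this commit, so checkContextSize is effectively split in two, with only the validation half remaining in this file:

    // checkContextSize (old, single preprocessor) is now two preprocessors:
    //   countPromptTokens:   count tokens, stash debug info
    //   validateContextSize: enforce model/proxy context limits
    // maybeTranslateOpenAIModel is removed from this file in the final hunk below.
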
@@ -125,7 +80,7 @@ function validateContextSize(req: Request) {
   req.debug.completion_tokens = outputTokens;
   req.debug.max_model_tokens = modelMax;
   req.debug.max_proxy_tokens = proxyMax;
-}
+};

 function assertRequestHasTokenCounts(
   req: Request
@@ -137,27 +92,3 @@ function assertRequestHasTokenCounts(
     .nonstrict()
     .parse({ promptTokens: req.promptTokens, outputTokens: req.outputTokens });
 }
-
-/**
- * For OpenAI-to-Anthropic requests, users can't specify the model, so we need
- * to pick one based on the final context size. Ideally this would happen in
- * the `transformOutboundPayload` preprocessor, but we don't have the context
- * size at that point (and need a transformed body to calculate it).
- */
-function maybeTranslateOpenAIModel(req: Request) {
-  if (req.inboundApi !== "openai" || req.outboundApi !== "anthropic") {
-    return;
-  }
-
-  const bigModel = process.env.CLAUDE_BIG_MODEL || "claude-v1-100k";
-  const contextSize = req.promptTokens! + req.outputTokens!;
-
-  if (contextSize > 8500) {
-    req.log.debug(
-      { model: bigModel, contextSize },
-      "Using Claude 100k model for OpenAI-to-Anthropic request"
-    );
-    req.body.model = bigModel;
-  }
-  // Small model is the default already set in `transformOutboundPayload`
-}