// simple-proxy/src/proxy/middleware/response/index.ts

import { Request, Response } from "express";
import * as http from "http";
import util from "util";
import zlib from "zlib";
import * as httpProxy from "http-proxy";
import { config } from "../../../config";
import { logger } from "../../../logger";
import { keyPool } from "../../../key-management";
import { buildFakeSseMessage, enqueue, trackWaitTime } from "../../queue";
import { handleStreamedResponse } from "./handle-streamed-response";
import { logPrompt } from "./log-prompt";
import { incrementPromptCount } from "../../auth/user-store";

export const QUOTA_ROUTES = ["/v1/chat/completions"];

const DECODER_MAP = {
  gzip: util.promisify(zlib.gunzip),
  deflate: util.promisify(zlib.inflate),
  br: util.promisify(zlib.brotliDecompress),
};
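
// For illustration: each entry above is a promisified zlib decompressor, so
// decoding a supported body is a single await (the buffer name is arbitrary):
//   const decoded = await DECODER_MAP.gzip(compressedBuffer); // => Buffer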

const isSupportedContentEncoding = (
  contentEncoding: string
): contentEncoding is keyof typeof DECODER_MAP => {
  return contentEncoding in DECODER_MAP;
};

class RetryableError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "RetryableError";
  }
}

/**
 * Either decodes or streams the entire response body and then passes it as the
 * last argument to the rest of the middleware stack.
 */
export type RawResponseBodyHandler = (
  proxyRes: http.IncomingMessage,
  req: Request,
  res: Response
) => Promise<string | Record<string, any>>;

export type ProxyResHandlerWithBody = (
  proxyRes: http.IncomingMessage,
  req: Request,
  res: Response,
  /**
   * This will be an object if the response content-type is application/json,
   * or if the response is a streaming response. Otherwise it will be a string.
   */
  body: string | Record<string, any>
) => Promise<void>;
export type ProxyResMiddleware = ProxyResHandlerWithBody[];
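
// For illustration: a minimal ProxyResHandlerWithBody of the shape expected by
// createOnProxyResHandler below. The handler name and log line are made up:
//   const logModel: ProxyResHandlerWithBody = async (_proxyRes, req, _res, body) => {
//     if (typeof body === "object") {
//       req.log.info({ model: body.model }, "Model used by upstream.");
//     }
//   };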

/**
 * Returns an `on.proxyRes` handler that executes the given middleware stack
 * after the common proxy response handlers have processed the response and
 * decoded the body. Custom middleware won't execute if the response is
 * determined to be an error from the upstream service, as the response will be
 * taken over by the common error handler.
 *
 * For streaming responses, the handleStreamedResponse middleware will block
 * remaining middleware from executing as it consumes the stream and forwards
 * events to the client. Once the stream is closed, the finalized body will be
 * attached to res.body and the remaining middleware will execute.
 */
export const createOnProxyResHandler = (apiMiddleware: ProxyResMiddleware) => {
  return async (
    proxyRes: http.IncomingMessage,
    req: Request,
    res: Response
  ) => {
    const initialHandler = req.isStreaming
      ? handleStreamedResponse
      : decodeResponseBody;
    let lastMiddlewareName = initialHandler.name;

    try {
      const body = await initialHandler(proxyRes, req, res);

      const middlewareStack: ProxyResMiddleware = [];
      if (req.isStreaming) {
        // `handleStreamedResponse` writes to the response and ends it, so
        // we can only execute middleware that doesn't write to the response.
        middlewareStack.push(trackRateLimit, incrementKeyUsage, logPrompt);
      } else {
        middlewareStack.push(
          trackRateLimit,
          handleUpstreamErrors,
          incrementKeyUsage,
          copyHttpHeaders,
          logPrompt,
          ...apiMiddleware
        );
      }

      for (const middleware of middlewareStack) {
        lastMiddlewareName = middleware.name;
        await middleware(proxyRes, req, res, body);
      }

      trackWaitTime(req);
    } catch (error: any) {
      // Hack: if the error is a retryable rate-limit error, the request has
      // been re-enqueued and we can just return without doing anything else.
      if (error instanceof RetryableError) {
        return;
      }

      const errorData = {
        error: error.stack,
        thrownBy: lastMiddlewareName,
        key: req.key?.hash,
      };
      const message = `Error while executing proxy response middleware: ${lastMiddlewareName} (${error.message})`;
      if (res.headersSent) {
        req.log.error(errorData, message);
        // This should have already been handled by the error handler, but
        // just in case...
        if (!res.writableEnded) {
          res.end();
        }
        return;
      }
      logger.error(errorData, message);
      res
        .status(500)
        .json({ error: "Internal server error", proxy_note: message });
    }
  };
};
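
// For illustration: a sketch of how this factory might be wired into the
// node-http-proxy server. The router/queue elsewhere in this repo does the
// real wiring; `apiTransforms` is a made-up middleware list:
//   const proxy = httpProxy.createProxyServer({ changeOrigin: true });
//   proxy.on("proxyRes", createOnProxyResHandler(apiTransforms));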

/** Marks the request for retry and puts it back on the request queue. */
function reenqueueRequest(req: Request) {
  req.log.info(
    { key: req.key?.hash, retryCount: req.retryCount },
    `Re-enqueueing request due to rate-limit error`
  );
  req.retryCount++;
  enqueue(req);
}

/**
 * Handles the response from the upstream service and decodes the body if
 * necessary. If the response is JSON, it will be parsed and returned as an
 * object. Otherwise, it will be returned as a string.
 * @throws {Error} Unsupported content-encoding or invalid application/json body
 */
export const decodeResponseBody: RawResponseBodyHandler = async (
  proxyRes,
  req,
  res
) => {
  if (req.isStreaming) {
    req.log.error(
      { api: req.api, key: req.key?.hash },
      `decodeResponseBody called for a streaming request, which isn't valid.`
    );
    throw new Error("decodeResponseBody called for a streaming request.");
  }

  const promise = new Promise<string | Record<string, any>>((resolve, reject) => {
    const chunks: Buffer[] = [];
    proxyRes.on("data", (chunk) => chunks.push(chunk));
    proxyRes.on("end", async () => {
      let body = Buffer.concat(chunks);

      const contentEncoding = proxyRes.headers["content-encoding"];
      if (contentEncoding) {
        if (isSupportedContentEncoding(contentEncoding)) {
          const decoder = DECODER_MAP[contentEncoding];
          body = await decoder(body);
        } else {
          const errorMessage = `Proxy received response with unsupported content-encoding: ${contentEncoding}`;
          logger.warn({ contentEncoding, key: req.key?.hash }, errorMessage);
          writeErrorResponse(res, 500, {
            error: errorMessage,
            contentEncoding,
          });
          return reject(new Error(errorMessage));
        }
      }

      try {
        if (proxyRes.headers["content-type"]?.includes("application/json")) {
          const json = JSON.parse(body.toString());
          return resolve(json);
        }
        return resolve(body.toString());
      } catch (error: any) {
        const errorMessage = `Proxy received response with invalid JSON: ${error.message}`;
        logger.warn({ error, key: req.key?.hash }, errorMessage);
        writeErrorResponse(res, 500, { error: errorMessage });
        return reject(new Error(errorMessage));
      }
    });
  });
  return promise;
};

// TODO: This is too specific to OpenAI's error responses; Anthropic errors
// will need a different handler.
/**
 * Handles non-2xx responses from the upstream service. If the proxied response
 * is an error, this will respond to the client with an error payload and throw
 * an error to stop the middleware stack.
 * On 429 errors, if request queueing is enabled, the request will be silently
 * re-enqueued. Otherwise, the request will be rejected with an error payload.
 * @throws {Error} On HTTP error status code from upstream service
 */
const handleUpstreamErrors: ProxyResHandlerWithBody = async (
  proxyRes,
  req,
  res,
  body
) => {
  const statusCode = proxyRes.statusCode || 500;
  if (statusCode < 400) {
    return;
  }

  let errorPayload: Record<string, any>;
  // Subtract 1 from available keys because if this message is being shown,
  // it's because the key is about to be disabled.
  const availableKeys = keyPool.available() - 1;
  const tryAgainMessage =
    availableKeys > 0
      ? `There are ${availableKeys} more keys available; try your request again.`
      : "There are no more keys available.";

  try {
    if (typeof body === "object") {
      errorPayload = body;
    } else {
      throw new Error("Received unparsable error response from upstream.");
    }
  } catch (parseError: any) {
    const statusMessage = proxyRes.statusMessage || "Unknown error";
    // Likely Bad Gateway or Gateway Timeout from OpenAI's Cloudflare proxy
    logger.warn(
      { statusCode, statusMessage, key: req.key?.hash },
      parseError.message
    );

    const errorObject = {
      statusCode,
      statusMessage: proxyRes.statusMessage,
      error: parseError.message,
      proxy_note: `This is likely a temporary error with the upstream service.`,
    };
    writeErrorResponse(res, statusCode, errorObject);
    throw new Error(parseError.message);
  }

  logger.warn(
    {
      statusCode,
      type: errorPayload.error?.code,
      errorPayload,
      key: req.key?.hash,
    },
    `Received error response from upstream. (${proxyRes.statusMessage})`
  );

  if (statusCode === 400) {
    // Bad request (likely prompt is too long)
    errorPayload.proxy_note = `OpenAI rejected the request as invalid. Your prompt may be too long for ${req.body?.model}.`;
  } else if (statusCode === 401) {
    // Key is invalid or was revoked
    keyPool.disable(req.key!);
    errorPayload.proxy_note = `The OpenAI key is invalid or revoked. ${tryAgainMessage}`;
  } else if (statusCode === 429) {
    const type = errorPayload.error?.type;
    if (type === "insufficient_quota") {
      // Billing quota exceeded (key is dead, disable it)
      keyPool.disable(req.key!);
      errorPayload.proxy_note = `Assigned key's quota has been exceeded. ${tryAgainMessage}`;
    } else if (type === "billing_not_active") {
      // Billing is not active (key is dead, disable it)
      keyPool.disable(req.key!);
      errorPayload.proxy_note = `Assigned key was deactivated by OpenAI. ${tryAgainMessage}`;
    } else if (type === "requests" || type === "tokens") {
      // Per-minute request or token rate limit is exceeded, which we can retry
      keyPool.markRateLimited(req.key!.hash);
      if (config.queueMode !== "none") {
        reenqueueRequest(req);
        // TODO: I don't like using an error to control flow here
        throw new RetryableError("Rate-limited request re-enqueued.");
      }
      errorPayload.proxy_note = `Assigned key's '${type}' rate limit has been exceeded. Try again later.`;
    } else {
      // OpenAI probably overloaded
      errorPayload.proxy_note = `This is likely a temporary error with OpenAI. Try again in a few seconds.`;
    }
  } else if (statusCode === 404) {
    // Most likely model not found
    // TODO: this probably doesn't handle GPT-4-32k variants properly if the
    // proxy has keys for both the 8k and 32k context models at the same time.
    if (errorPayload.error?.code === "model_not_found") {
      if (req.key!.isGpt4) {
        errorPayload.proxy_note = `Assigned key isn't provisioned for the GPT-4 snapshot you requested. Try again to get a different key, or use Turbo.`;
      } else {
        errorPayload.proxy_note = `No model was found for this key.`;
      }
    }
  } else {
    errorPayload.proxy_note = `Unrecognized error from OpenAI.`;
  }

  // Some OAI errors contain the organization ID, which we don't want to reveal.
  if (errorPayload.error?.message) {
    errorPayload.error.message = errorPayload.error.message.replace(
      /org-.{24}/gm,
      "org-xxxxxxxxxxxxxxxxxxx"
    );
  }

  writeErrorResponse(res, statusCode, errorPayload);
  throw new Error(errorPayload.error?.message);
};
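
// Re: the TODO above — for illustration only, an Anthropic-specific handler
// would branch on Anthropic's error envelope instead. The shape below is an
// assumption, not something this file confirms:
//   if (body.error?.type === "rate_limit_error") { /* mark key rate-limited */ }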

function writeErrorResponse(
  res: Response,
  statusCode: number,
  errorPayload: Record<string, any>
) {
  // If we're mid-SSE stream, send a data event with the error payload and end
  // the stream. Otherwise just send a normal error response.
  if (
    res.headersSent ||
    res.getHeader("content-type") === "text/event-stream"
  ) {
    const msg = buildFakeSseMessage(
      `upstream error (${statusCode})`,
      JSON.stringify(errorPayload, null, 2)
    );
    res.write(msg);
    res.write(`data: [DONE]\n\n`);
    res.end();
  } else {
    res.status(statusCode).json(errorPayload);
  }
}
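
// For illustration: on the wire, the SSE branch above ends the stream with
// roughly the following (the exact shape of the first event comes from
// buildFakeSseMessage in ../../queue, so that line is an assumption):
//   data: { ...fake completion event carrying the error payload... }
//
//   data: [DONE]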

/** Handles errors in rewriter pipelines. */
export const handleInternalError: httpProxy.ErrorCallback = (
  err,
  _req,
  res
) => {
  logger.error({ error: err }, "Error in http-proxy-middleware pipeline.");
  try {
    writeErrorResponse(res as Response, 500, {
      error: {
        type: "proxy_error",
        message: err.message,
        stack: err.stack,
        proxy_note: `Reverse proxy encountered an error before it could reach the upstream API.`,
      },
    });
  } catch (e) {
    logger.error(
      { error: e },
      `Error writing error response headers, giving up.`
    );
  }
};

/** Increments key and user prompt counts on routes that consume quota. */
const incrementKeyUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
  if (QUOTA_ROUTES.includes(req.path)) {
    keyPool.incrementPrompt(req.key?.hash);
    if (req.user) {
      incrementPromptCount(req.user.token);
    }
  }
};

/** Refreshes the key's rate-limit state from the upstream response headers. */
const trackRateLimit: ProxyResHandlerWithBody = async (proxyRes, req) => {
  keyPool.updateRateLimits(req.key!.hash, proxyRes.headers);
};

/**
 * Copies response headers from the upstream to the client, omitting ones that
 * would be invalid after the proxy's processing.
 */
const copyHttpHeaders: ProxyResHandlerWithBody = async (
  proxyRes,
  _req,
  res
) => {
  Object.keys(proxyRes.headers).forEach((key) => {
    // Omit content-encoding because we will always decode the response body
    if (key === "content-encoding") {
      return;
    }
    // We usually use res.json() to send the response, which sets
    // content-length. Forwarding the upstream transfer-encoding alongside
    // content-length is invalid and some clients reject it, so omit
    // transfer-encoding here.
    if (key === "transfer-encoding") {
      return;
    }
    res.setHeader(key, proxyRes.headers[key] as string);
  });
};