removes QUEUE_MODE config (now always enabled)
@@ -33,12 +33,6 @@ const rewriteRequest = (
   req: Request,
   res: Response
 ) => {
-  if (config.queueMode !== "none") {
-    const msg = `Queueing is enabled on this proxy instance and is incompatible with the KoboldAI endpoint. Use the OpenAI endpoint instead.`;
-    proxyReq.destroy(new Error(msg));
-    return;
-  }
-
   req.body.stream = false;
   const rewriterPipeline = [
     addKey,
@@ -341,11 +341,8 @@ function maybeHandleMissingPreambleError(
       "Request failed due to missing preamble. Key will be marked as such for subsequent requests."
     );
     keyPool.update(req.key!, { requiresPreamble: true });
-    if (config.queueMode !== "none") {
-      reenqueueRequest(req);
-      throw new RetryableError("Claude request re-enqueued to add preamble.");
-    }
-    errorPayload.proxy_note = `This Claude key requires special prompt formatting. Try again; the proxy will reformat your prompt next time.`;
+    reenqueueRequest(req);
+    throw new RetryableError("Claude request re-enqueued to add preamble.");
   } else {
     errorPayload.proxy_note = `Proxy received unrecognized error from Anthropic. Check the specific error for more information.`;
   }
@@ -357,11 +354,8 @@ function handleAnthropicRateLimitError(
 ) {
   if (errorPayload.error?.type === "rate_limit_error") {
     keyPool.markRateLimited(req.key!);
-    if (config.queueMode !== "none") {
-      reenqueueRequest(req);
-      throw new RetryableError("Claude rate-limited request re-enqueued.");
-    }
-    errorPayload.proxy_note = `There are too many in-flight requests for this key. Try again later.`;
+    reenqueueRequest(req);
+    throw new RetryableError("Claude rate-limited request re-enqueued.");
   } else {
     errorPayload.proxy_note = `Unrecognized rate limit error from Anthropic. Key may be over quota.`;
   }
@@ -388,13 +382,11 @@ function handleOpenAIRateLimitError(
   } else if (type === "requests" || type === "tokens") {
     // Per-minute request or token rate limit is exceeded, which we can retry
     keyPool.markRateLimited(req.key!);
-    if (config.queueMode !== "none") {
-      reenqueueRequest(req);
-      // This is confusing, but it will bubble up to the top-level response
-      // handler and cause the request to go back into the request queue.
-      throw new RetryableError("Rate-limited request re-enqueued.");
-    }
-    errorPayload.proxy_note = `Assigned key's '${type}' rate limit has been exceeded. Try again later.`;
+    // I'm aware this is confusing -- throwing this class of error will cause
+    // the proxy response handler to return without terminating the request,
+    // so that it can be placed back in the queue.
+    reenqueueRequest(req);
+    throw new RetryableError("Rate-limited request re-enqueued.");
   } else {
     // OpenAI probably overloaded
     errorPayload.proxy_note = `This is likely a temporary error with OpenAI. Try again in a few seconds.`;
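All of the handlers above rely on the same convention: reenqueueRequest(req) puts the request back in the queue, and the thrown RetryableError tells the response pipeline to stop without terminating the client connection. A minimal sketch of that upstream catch, with handleProxyResponse and runErrorHandlers as hypothetical stand-ins for the proxy's real pipeline:

    import type { Request, Response } from "express";

    class RetryableError extends Error {}

    // Hypothetical stand-in for the chain of error handlers shown above.
    declare function runErrorHandlers(req: Request, res: Response): void;

    function handleProxyResponse(req: Request, res: Response) {
      try {
        // Handlers may call reenqueueRequest(req) and then throw
        // RetryableError to abort further response processing.
        runErrorHandlers(req, res);
      } catch (err) {
        if (err instanceof RetryableError) {
          // The request is already back in the queue; return without writing
          // a response so the client connection stays open for the retry.
          return;
        }
        throw err; // anything else is a genuine failure
      }
    }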
+3 -19
@@ -16,7 +16,6 @@
  */

 import type { Handler, Request } from "express";
-import { config, DequeueMode } from "../config";
 import { keyPool, SupportedModel } from "../key-management";
 import { logger } from "../logger";
 import { AGNAI_DOT_CHAT_IP } from "./rate-limit";
@@ -27,8 +26,6 @@ export type QueuePartition = "claude" | "turbo" | "gpt-4";
 const queue: Request[] = [];
 const log = logger.child({ module: "request-queue" });

-let dequeueMode: DequeueMode = "fair";
-
 /** Maximum number of queue slots for Agnai.chat requests. */
 const AGNAI_CONCURRENCY_LIMIT = 15;
 /** Maximum number of queue slots for individual users. */
@@ -160,18 +157,9 @@ export function dequeue(partition: QueuePartition): Request | undefined {
     return undefined;
   }

-  let req: Request;
-
-  if (dequeueMode === "fair") {
-    // Dequeue the request that has been waiting the longest
-    req = modelQueue.reduce((prev, curr) =>
-      prev.startTime < curr.startTime ? prev : curr
-    );
-  } else {
-    // Dequeue a random request
-    const index = Math.floor(Math.random() * modelQueue.length);
-    req = modelQueue[index];
-  }
+  const req = modelQueue.reduce((prev, curr) =>
+    prev.startTime < curr.startTime ? prev : curr
+  );
   queue.splice(queue.indexOf(req), 1);

   if (req.onAborted) {
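The reduce call that replaces the two dequeue modes is an argmin over startTime: with random dequeueing gone, the queue always serves the longest-waiting request. A standalone illustration:

    // The fair-dequeue reduce returns the element with the smallest
    // startTime, i.e. the request that has been waiting the longest.
    const modelQueue = [{ startTime: 300 }, { startTime: 100 }, { startTime: 200 }];
    const oldest = modelQueue.reduce((prev, curr) =>
      prev.startTime < curr.startTime ? prev : curr
    );
    console.log(oldest.startTime); // 100

Note that reduce without an initial value throws on an empty array, which is why the length guard above still returns undefined early.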
@@ -293,10 +281,6 @@ export function getQueueLength(partition: QueuePartition | "all" = "all") {

 export function createQueueMiddleware(proxyMiddleware: Handler): Handler {
   return (req, res, next) => {
-    if (config.queueMode === "none") {
-      return proxyMiddleware(req, res, next);
-    }
-
     req.proceed = () => {
       proxyMiddleware(req, res, next);
     };
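With the queueMode check removed, every request reaching the middleware is now queued, and its proxy handling is captured in the req.proceed callback. A hypothetical sketch of the consumer side (the dequeue loop itself is not part of this diff):

    // Hypothetical consumer: when a slot frees up, pull the longest-waiting
    // request for the partition and resume its proxy handling.
    function processQueue(partition: QueuePartition) {
      const req = dequeue(partition);
      if (req) {
        req.proceed(); // invokes the stored proxyMiddleware(req, res, next)
      }
    }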