/**
 * Very scuffed request queue. OpenAI's GPT-4 keys have a very strict rate
 * limit of 40000 generated tokens per minute. We don't actually know how many
 * tokens a given key has generated, so our queue will simply retry requests
 * that fail with a non-billing-related 429 over and over again until they
 * succeed.
 *
 * Dequeueing can operate in one of two modes:
 * - 'fair': requests are dequeued in the order they were enqueued.
 * - 'random': requests are dequeued randomly, not really a queue at all.
 *
 * When a request to a proxied endpoint is received, we create a closure around
 * the call to http-proxy-middleware and attach it to the request. This allows
 * us to pause the request until we have a key available. Further, if the
 * proxied request encounters a retryable error, we can simply put the request
 * back in the queue and it will be retried later using the same closure.
 */
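
/*
 * Example wiring (a sketch; the route and proxy options here are illustrative,
 * not necessarily what the real proxy routes use):
 *
 *   import { createProxyMiddleware } from "http-proxy-middleware";
 *   import { createQueueMiddleware } from "./queue";
 *
 *   const proxy = createProxyMiddleware({
 *     target: "https://api.openai.com",
 *     changeOrigin: true,
 *   });
 *   app.use("/proxy/openai", createQueueMiddleware(proxy));
 *
 * The wrapped handler stores a `proceed` closure on the request and enqueues
 * it; `processQueue` later invokes that closure to forward the request.
 */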

import type { Handler, Request } from "express";
import { config, DequeueMode } from "../config";
import { keyPool } from "../key-management";
import { logger } from "../logger";
import { AGNAI_DOT_CHAT_IP } from "./rate-limit";

const queue: Request[] = [];
const log = logger.child({ module: "request-queue" });

let dequeueMode: DequeueMode = "fair";

/** Maximum number of queue slots for Agnai.chat requests. */
const AGNAI_CONCURRENCY_LIMIT = 15;
/** Maximum number of queue slots for individual users. */
const USER_CONCURRENCY_LIMIT = 1;

const sameIpPredicate = (incoming: Request) => (queued: Request) =>
  queued.ip === incoming.ip;
const sameUserPredicate = (incoming: Request) => (queued: Request) => {
  const incomingUser = incoming.user ?? { token: incoming.ip };
  const queuedUser = queued.user ?? { token: queued.ip };
  return queuedUser.token === incomingUser.token;
};

export function enqueue(req: Request) {
  let enqueuedRequestCount = 0;
  const isGuest = req.user?.token === undefined;

  if (isGuest) {
    enqueuedRequestCount = queue.filter(sameIpPredicate(req)).length;
  } else {
    enqueuedRequestCount = queue.filter(sameUserPredicate(req)).length;
  }

  // All Agnai.chat requests come from the same IP, so we allow them to have
  // more spots in the queue. Can't make it unlimited because people will
  // intentionally abuse it.
  // Authenticated users always get a single spot in the queue.
  const maxConcurrentQueuedRequests =
    isGuest && req.ip === AGNAI_DOT_CHAT_IP
      ? AGNAI_CONCURRENCY_LIMIT
      : USER_CONCURRENCY_LIMIT;
  if (enqueuedRequestCount >= maxConcurrentQueuedRequests) {
    if (req.ip === AGNAI_DOT_CHAT_IP) {
      // Re-enqueued requests are not counted towards the limit since they
      // already made it through the queue once.
      if (req.retryCount === 0) {
        throw new Error("Too many agnai.chat requests are already queued");
      }
    } else {
      throw new Error("Your IP or token already has a request in the queue");
    }
  }

  queue.push(req);
  req.queueOutTime = 0;

  // Shitty hack to strip http-proxy-middleware's event listeners from retried
  // requests; see removeProxyMiddlewareEventListeners below.
  removeProxyMiddlewareEventListeners(req);

  // If the request opted into streaming, we need to register a heartbeat
  // handler to keep the connection alive while it waits in the queue. We
  // deregister the handler when the request is dequeued.
  if (req.body.stream) {
    const res = req.res!;
    if (!res.headersSent) {
      initStreaming(req);
    }
    req.heartbeatInterval = setInterval(() => {
      if (process.env.NODE_ENV === "production") {
        req.res!.write(": queue heartbeat\n\n");
      } else {
        req.log.info(`Sending heartbeat to request in queue.`);
        const avgWait = Math.round(getEstimatedWaitTime() / 1000);
        const currentDuration = Math.round((Date.now() - req.startTime) / 1000);
        const debugMsg = `queue length: ${queue.length}; elapsed time: ${currentDuration}s; avg wait: ${avgWait}s`;
        req.res!.write(buildFakeSseMessage("heartbeat", debugMsg));
      }
    }, 10000);
  }

  // Register a handler to remove the request from the queue if the connection
  // is aborted or closed before it is dequeued.
  const removeFromQueue = () => {
    req.log.info(`Removing aborted request from queue.`);
    const index = queue.indexOf(req);
    if (index !== -1) {
      queue.splice(index, 1);
    }
    if (req.heartbeatInterval) {
      clearInterval(req.heartbeatInterval);
    }
  };
  req.onAborted = removeFromQueue;
  req.res!.once("close", removeFromQueue);

  if ((req.retryCount ?? 0) > 0) {
    req.log.info({ retries: req.retryCount }, `Enqueued request for retry.`);
  } else {
    req.log.info(`Enqueued new request.`);
  }
}
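
/*
 * For example, two concurrent requests from the same unauthenticated IP
 * (anything other than AGNAI_DOT_CHAT_IP) hit the USER_CONCURRENCY_LIMIT of 1:
 *
 *   enqueue(first);  // ok, takes the IP's only queue slot
 *   enqueue(second); // throws "Your IP or token already has a request in the queue"
 */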

export function dequeue(model: string): Request | undefined {
  // TODO: This should be set by some middleware that checks the request body.
  const modelQueue =
    model === "gpt-4"
      ? queue.filter((req) => req.body.model?.startsWith("gpt-4"))
      : queue.filter((req) => !req.body.model?.startsWith("gpt-4"));

  if (modelQueue.length === 0) {
    return undefined;
  }

  let req: Request;

  if (dequeueMode === "fair") {
    // Dequeue the request that has been waiting the longest
    req = modelQueue.reduce((prev, curr) =>
      prev.startTime < curr.startTime ? prev : curr
    );
  } else {
    // Dequeue a random request
    const index = Math.floor(Math.random() * modelQueue.length);
    req = modelQueue[index];
  }
  queue.splice(queue.indexOf(req), 1);

  if (req.onAborted) {
    req.res!.off("close", req.onAborted);
    req.onAborted = undefined;
  }

  if (req.heartbeatInterval) {
    clearInterval(req.heartbeatInterval);
  }

  // Track the time leaving the queue now, but don't add it to the wait times
  // yet because we don't know if the request will succeed or fail. We track
  // the time now and not after the request succeeds because we don't want to
  // include the model processing time.
  req.queueOutTime = Date.now();
  return req;
}
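
/*
 * Retry flow (sketch; the actual 429 handling lives in the proxy response
 * hooks elsewhere in this codebase): on a retryable error, the request goes
 * back into the queue and its original `proceed` closure is reused.
 *
 *   // hypothetical retry handler:
 *   req.retryCount = (req.retryCount ?? 0) + 1;
 *   enqueue(req); // the same closure runs again when the request is dequeued
 */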

/**
 * Naive way to keep the queue moving by continuously dequeuing requests. Not
 * ideal because it limits throughput, but we probably won't have enough
 * traffic or keys for this to be a problem. If it does become one, we can
 * dequeue multiple requests per tick (see the sketch after this function).
 */
function processQueue() {
  // This isn't completely correct, because a key can service multiple models.
  // Currently if a key is locked out on one model it will also stop servicing
  // the others, because we only track one rate limit per key.
  const gpt4Lockout = keyPool.getLockoutPeriod("gpt-4");
  const turboLockout = keyPool.getLockoutPeriod("gpt-3.5-turbo");

  const reqs: (Request | undefined)[] = [];
  if (gpt4Lockout === 0) {
    reqs.push(dequeue("gpt-4"));
  }
  if (turboLockout === 0) {
    reqs.push(dequeue("gpt-3.5-turbo"));
  }

  reqs.filter(Boolean).forEach((req) => {
    if (req?.proceed) {
      req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
      req.proceed();
    }
  });
  setTimeout(processQueue, 50);
}
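
/*
 * A batched variant, if single-request ticks ever become a bottleneck
 * (untested sketch; BATCH_SIZE is hypothetical):
 *
 *   const BATCH_SIZE = 5;
 *   for (const model of ["gpt-4", "gpt-3.5-turbo"]) {
 *     if (keyPool.getLockoutPeriod(model) === 0) {
 *       for (let i = 0; i < BATCH_SIZE; i++) {
 *         const next = dequeue(model);
 *         if (!next?.proceed) break;
 *         next.proceed();
 *       }
 *     }
 *   }
 */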

/**
 * Kill stalled requests after 5 minutes, and remove tracked wait times once
 * they fall outside the 5-minute window used for the wait time estimate.
 */
function cleanQueue() {
  const now = Date.now();
  const oldRequests = queue.filter(
    (req) => now - (req.startTime ?? now) > 5 * 60 * 1000
  );
  oldRequests.forEach((req) => {
    req.log.info(`Removing request from queue after 5 minutes.`);
    killQueuedRequest(req);
  });

  // waitTimes is roughly chronological, so stale entries sit at the front.
  const index = waitTimes.findIndex(
    (waitTime) => now - waitTime.end > 300 * 1000
  );
  const removed = waitTimes.splice(0, index + 1);
  log.debug(
    { stalledRequests: oldRequests.length, prunedWaitTimes: removed.length },
    `Cleaning up request queue.`
  );
  setTimeout(cleanQueue, 20 * 1000);
}

export function start() {
  processQueue();
  cleanQueue();
  log.info(`Started request queue.`);
}

let waitTimes: { start: number; end: number }[] = [];

/** Adds a successful request to the list of wait times. */
export function trackWaitTime(req: Request) {
  waitTimes.push({
    start: req.startTime!,
    end: req.queueOutTime ?? Date.now(),
  });
}

/** Returns the average wait time over the last five minutes, in milliseconds. */
export function getEstimatedWaitTime() {
  const now = Date.now();
  const recentWaits = waitTimes.filter((wt) => now - wt.end < 300 * 1000);
  if (recentWaits.length === 0) {
    return 0;
  }

  return (
    recentWaits.reduce((sum, wt) => sum + wt.end - wt.start, 0) /
    recentWaits.length
  );
}
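
/*
 * e.g. with three requests that waited 2s, 4s, and 6s in the last five
 * minutes, the estimate is (2000 + 4000 + 6000) / 3 = 4000ms.
 */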

export function getQueueLength() {
  return queue.length;
}

export function createQueueMiddleware(proxyMiddleware: Handler): Handler {
  return (req, res, next) => {
    if (config.queueMode === "none") {
      return proxyMiddleware(req, res, next);
    }

    req.proceed = () => {
      proxyMiddleware(req, res, next);
    };

    try {
      enqueue(req);
    } catch (err: any) {
      req.res!.status(429).json({
        type: "proxy_error",
        message: err.message,
        stack: err.stack,
        proxy_note: `Only one request per IP can be queued at a time. If you don't have another request queued, your IP may be in use by another user.`,
      });
    }
  };
}

function killQueuedRequest(req: Request) {
  if (!req.res || req.res.writableEnded) {
    req.log.warn(`Attempted to terminate request that has already ended.`);
    return;
  }
  const res = req.res;
  try {
    const message = `Your request has been terminated by the proxy because it has been in the queue for more than 5 minutes. The queue is currently ${queue.length} requests long.`;
    if (res.headersSent) {
      const fakeErrorEvent = buildFakeSseMessage("proxy queue error", message);
      res.write(fakeErrorEvent);
      res.end();
    } else {
      res.status(500).json({ error: message });
    }
  } catch (e) {
    req.log.error(e, `Error killing stalled request.`);
  }
}

function initStreaming(req: Request) {
  req.log.info(`Initiating streaming for new queued request.`);
  const res = req.res!;
  res.statusCode = 200;
  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache");
  res.setHeader("Connection", "keep-alive");
  res.setHeader("X-Accel-Buffering", "no"); // nginx-specific fix
  res.flushHeaders();
  res.write("\n");
  res.write(": joining queue\n\n");
}

export function buildFakeSseMessage(type: string, text: string) {
  const fakeEvent = {
    id: "chatcmpl-" + type,
    object: "chat.completion.chunk",
    created: Date.now(),
    model: "",
    choices: [
      {
        delta: { content: `[${type}: ${text}]\n` },
        index: 0,
        finish_reason: type,
      },
    ],
  };
  return `data: ${JSON.stringify(fakeEvent)}\n\n`;
}
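
/*
 * For example, buildFakeSseMessage("heartbeat", "queue length: 3") produces a
 * single SSE frame (the `created` timestamp varies):
 *
 *   data: {"id":"chatcmpl-heartbeat","object":"chat.completion.chunk","created":1690000000000,"model":"","choices":[{"delta":{"content":"[heartbeat: queue length: 3]\n"},"index":0,"finish_reason":"heartbeat"}]}
 *
 * followed by a blank line, per the SSE framing rules.
 */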

/**
 * http-proxy-middleware attaches a bunch of event listeners to the req and
 * res objects which causes problems with our approach to re-enqueuing failed
 * proxied requests. This function removes those event listeners.
 * We don't have references to the original event listeners, so we have to
 * look through the list and remove HPM's listeners by looking for particular
 * strings in the listener functions. This is an astoundingly shitty way to do
 * this, but it's the best I can come up with.
 */
function removeProxyMiddlewareEventListeners(req: Request) {
  // node_modules/http-proxy-middleware/dist/plugins/default/debug-proxy-errors-plugin.js:29
  // res.listeners('close')
  const RES_ONCLOSE = `Destroying proxyRes in proxyRes close event`;
  // node_modules/http-proxy-middleware/dist/plugins/default/debug-proxy-errors-plugin.js:19
  // res.listeners('error')
  const RES_ONERROR = `Socket error in proxyReq event`;
  // node_modules/http-proxy/lib/http-proxy/passes/web-incoming.js:146
  // req.listeners('aborted')
  const REQ_ONABORTED = `proxyReq.abort()`;
  // node_modules/http-proxy/lib/http-proxy/passes/web-incoming.js:156
  // req.listeners('error')
  const REQ_ONERROR = `if (req.socket.destroyed`;

  const res = req.res!;

  const resOnClose = res
    .listeners("close")
    .find((listener) => listener.toString().includes(RES_ONCLOSE));
  if (resOnClose) {
    res.removeListener("close", resOnClose as any);
  }

  const resOnError = res
    .listeners("error")
    .find((listener) => listener.toString().includes(RES_ONERROR));
  if (resOnError) {
    res.removeListener("error", resOnError as any);
  }

  const reqOnAborted = req
    .listeners("aborted")
    .find((listener) => listener.toString().includes(REQ_ONABORTED));
  if (reqOnAborted) {
    req.removeListener("aborted", reqOnAborted as any);
  }

  const reqOnError = req
    .listeners("error")
    .find((listener) => listener.toString().includes(REQ_ONERROR));
  if (reqOnError) {
    req.removeListener("error", reqOnError as any);
  }
}