adds azure rate limit auto-retry

2023-12-04 01:23:55 -06:00
parent fbdea30264
commit fdd824f0e4
7 changed files with 88 additions and 55 deletions
@@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
 ```
 ## Model assignment
-Note that each Azure deployment is assigned a model when you create it in the Microsoft Cognitive Services portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
+Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
 ### Supported model IDs
 Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID.
 As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
 ## On content filtering
 Be aware that all Azure OpenAI Service deployments have content filtering enabled by default at a Medium level. Prompts or responses which are deemed to be inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service and not the proxy.
 You can disable this from the deployment's settings within Azure, but you would need to request an exemption from Microsoft for your organization first. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.
@@ -0,0 +1,44 @@
 const axios = require("axios");
 // How many requests are started by executeRequestsConcurrently.
 const concurrentRequests = 5;
 // HTTP headers attached to every request sent by makeRequest.
 const headers = {
  Authorization: "Bearer test",
  "Content-Type": "application/json",
 };
 // Request body shared by every request: a single-message, non-streaming
 // chat completion with max_tokens set to 1.
 const payload = {
  model: "gpt-4",
  max_tokens: 1,
  stream: false,
  messages: [{ role: "user", content: "Hi" }],
 };
 /**
  * POSTs the shared payload to the local proxy's Azure chat completions
  * endpoint and logs the outcome.
  *
  * On success the status code and response body are logged; on failure only
  * the error message is logged. Errors are not rethrown, so the returned
  * promise does not reject because of a failed request.
  *
  * @param i Label used to identify this request in the log output.
  */
 const makeRequest = async (i) => {
  const url = "http://localhost:7860/proxy/azure/openai/v1/chat/completions";
  const requestOptions = { headers };
  try {
    const { status, data } = await axios.post(url, payload, requestOptions);
    console.log(
      `Req ${i} finished with status code ${status} and response:`,
      data
    );
  } catch (err) {
    console.error(`Error in req ${i}:`, err.message);
  }
 };
 /**
  * Starts `concurrentRequests` requests one after another without awaiting
  * any of them, numbering them from 1, then logs a final message once all of
  * the request promises have resolved.
  */
 const executeRequestsConcurrently = () => {
  const pending = Array.from({ length: concurrentRequests }, (_, index) => {
    const requestNumber = index + 1;
    console.log(`Starting request ${requestNumber}`);
    return makeRequest(requestNumber);
  });
  Promise.all(pending).then(() => {
    console.log("All requests finished");
  });
 };
 // Run immediately when the script is executed.
 executeRequestsConcurrently();
@@ -1,40 +0,0 @@
 # Load-test script: sends $NumThreads POST requests to the local proxy's
 # AWS Claude completion endpoint, each from its own PowerShell instance
 # running on a shared runspace pool.
 $NumThreads = 10
 # Pool created with a minimum of 1 and a maximum of $NumThreads runspaces.
 $runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
 $runspacePool.Open()
 # Collects one { Pipe, Status } record per started invocation.
 $runspaces = @()
 # Headers sent with every request.
 $headers = @{
    "Authorization" = "Bearer test"
    "anthropic-version" = "2023-01-01"
    "Content-Type" = "application/json"
 }
 # Streaming completion request body, serialized to JSON once and reused.
 $payload = @{
    model = "claude-v2"
    max_tokens_to_sample = 40
    temperature = 0
    stream = $true
    prompt = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
 } | ConvertTo-Json
 # Start one asynchronous invocation per "thread".
 for ($i = 1; $i -le $NumThreads; $i++) {
    Write-Host "Starting thread $i"
    $runspace = [powershell]::Create()
    # NOTE: $i is passed to the script block but is not referenced inside it.
    $runspace.AddScript({
        param($i, $headers, $payload)
        $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
        Write-Host "Response from server: $($response.StatusCode)"
    }).AddArgument($i).AddArgument($headers).AddArgument($payload)
    $runspace.RunspacePool = $runspacePool
    # Keep the async handle so the invocation can be completed and disposed later.
    $runspaces += [PSCustomObject]@{ Pipe = $runspace; Status = $runspace.BeginInvoke() }
 }
 # Complete every invocation, then release its PowerShell instance.
 $runspaces | ForEach-Object {
    $_.Pipe.EndInvoke($_.Status)
    $_.Pipe.Dispose()
 }
 # Tear down the shared pool once all invocations are finished.
 $runspacePool.Close()
 $runspacePool.Dispose()
@@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
      case "aws":
        handleAwsRateLimitError(req, errorPayload);
        break;
      case "google-palm":
      case "azure":
        handleAzureRateLimitError(req, errorPayload);
        break;
      case "google-palm":
        errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
        break;
      default:
@@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
  return errorPayload;
 }
 /**
  * Handles a rate limit error returned by Azure.
  *
  * When the upstream error code is "429", the request's key is marked as
  * rate-limited in the key pool, the request is re-enqueued, and a
  * RetryableError is thrown. For any other code, a proxy note reporting the
  * unrecognized code is attached to the error payload instead.
  *
  * @param req The request that hit the error; its `key` is passed to the key pool.
  * @param errorPayload The upstream error payload; `proxy_note` is set on it
  *   when the error code is not recognized.
  * @throws RetryableError after re-enqueuing a request whose error code is "429".
  */
 function handleAzureRateLimitError(
  req: Request,
  errorPayload: ProxiedErrorPayload
 ) {
  const azureErrorCode = errorPayload.error?.code;
  if (azureErrorCode === "429") {
    keyPool.markRateLimited(req.key!);
    reenqueueRequest(req);
    throw new RetryableError("Rate-limited request re-enqueued.");
  }
  errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${azureErrorCode}). Please report this.`;
 }
 const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
  if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
    const model = req.body.model;
@@ -15,6 +15,8 @@ import crypto from "crypto";
 import type { Handler, Request } from "express";
 import { keyPool } from "../shared/key-management";
 import {
  getAwsBedrockModelFamily,
  getAzureOpenAIModelFamily,
  getClaudeModelFamily,
  getGooglePalmModelFamily,
  getOpenAIModelFamily,
@@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
  // they should be treated as separate queues.
  const model = req.body.model ?? "gpt-3.5-turbo";
-  // Weird special case for AWS because they serve multiple models from
+  // Weird special case for AWS/Azure because they serve multiple models from
  // different vendors, even if currently only one is supported.
-  if (req.service === "aws") {
+  if (req.service === "aws") return getAwsBedrockModelFamily(model);
-    return "aws-claude";
+  if (req.service === "azure") return getAzureOpenAIModelFamily(model);
  }
  switch (req.outboundApi) {
    case "anthropic":
@@ -221,7 +222,11 @@ function processQueue() {
  reqs.filter(Boolean).forEach((req) => {
    if (req?.proceed) {
-      req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
+      const modelFamily = getPartitionForRequest(req!);
      req.log.info({
        retries: req.retryCount,
        partition: modelFamily,
      }, `Dequeuing request.`);
      req.proceed();
    }
  });
@@ -415,6 +420,7 @@ function initStreaming(req: Request) {
    // Some clients have a broken SSE parser that doesn't handle comments
    // correctly. These clients can pass ?badSseParser=true to
    // disable comments in the SSE stream.
    res.write(getHeartbeatPayload());
    return;
  }
@@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
  // TODO: all of this shit is duplicate code
-  public getLockoutPeriod() {
+  public getLockoutPeriod(family: AzureOpenAIModelFamily) {
-    const activeKeys = this.keys.filter((k) => !k.isDisabled);
+    const activeKeys = this.keys.filter(
      (key) => !key.isDisabled && key.modelFamilies.includes(family)
    );
    // Don't lock out if there are no keys available or the queue will stall.
    // Just let it through so the add-key middleware can throw an error.
    if (activeKeys.length === 0) return 0;
@@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
      (key) => !key.isDisabled && key.modelFamilies.includes(family)
    );
-    if (activeKeys.length === 0) {
+    // Don't lock out if there are no keys available or the queue will stall.
-      // If there are no active keys for this model we can't fulfill requests.
+    // Just let it through so the add-key middleware can throw an error.
-      // We'll return 0 to let the request through and return an error,
+    if (activeKeys.length === 0) return 0;
      // otherwise the request will be stuck in the queue forever.
      return 0;
    }
    // A key is rate-limited if its `rateLimitedAt` plus the greater of its
    // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the