adds azure rate limit auto-retry

This commit is contained in:
nai-degen
2023-12-04 01:23:55 -06:00
parent fbdea30264
commit fdd824f0e4
7 changed files with 88 additions and 55 deletions
+6 -1
View File
@@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
```
## Model assignment
Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
### Supported model IDs
Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID.
As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
## On content filtering
Be aware that all Azure OpenAI Service deployments have content filtering enabled by default at a Medium level. Prompts or responses which are deemed to be inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service and not the proxy.
You can disable this from the deployment's settings within Azure, but you would need to request an exemption from Microsoft for your organization first. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.
+44
View File
@@ -0,0 +1,44 @@
// Load-test script: fires several concurrent chat-completion requests at a
// locally running proxy to exercise its Azure rate-limit handling.
const axios = require("axios");

// Number of simultaneous requests to send.
const concurrentRequests = 5;

const headers = {
  Authorization: "Bearer test",
  "Content-Type": "application/json",
};

// Minimal, cheap payload: one short message, one token, no streaming.
const payload = {
  model: "gpt-4",
  max_tokens: 1,
  stream: false,
  messages: [{ role: "user", content: "Hi" }],
};

// Sends request `i` and logs the outcome. Never rejects; failures are logged.
const makeRequest = async (i) => {
  try {
    const response = await axios.post(
      "http://localhost:7860/proxy/azure/openai/v1/chat/completions",
      payload,
      { headers }
    );
    console.log(
      `Req ${i} finished with status code ${response.status} and response:`,
      response.data
    );
  } catch (error) {
    // Axios wraps non-2xx responses in an error with a `response` property.
    // Surface the upstream status and body so rate-limit (429) payloads are
    // visible instead of just a generic "Request failed" message.
    if (error.response) {
      console.error(
        `Error in req ${i}: status ${error.response.status}, body:`,
        error.response.data
      );
    } else {
      console.error(`Error in req ${i}:`, error.message);
    }
  }
};

// Starts all requests at once and reports when every one has settled.
const executeRequestsConcurrently = () => {
  const promises = [];
  for (let i = 1; i <= concurrentRequests; i++) {
    console.log(`Starting request ${i}`);
    promises.push(makeRequest(i));
  }
  Promise.all(promises).then(() => {
    console.log("All requests finished");
  });
};

executeRequestsConcurrently();
-40
View File
@@ -1,40 +0,0 @@
# Load-test script: fires $NumThreads concurrent completion requests at the
# local proxy's AWS Claude endpoint, one per runspace, in parallel.
$NumThreads = 10

# Pool that caps parallelism at $NumThreads runspaces.
$runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
$runspacePool.Open()

$headers = @{
    "Authorization"     = "Bearer test"
    "anthropic-version" = "2023-01-01"
    "Content-Type"      = "application/json"
}

$payload = @{
    model                = "claude-v2"
    max_tokens_to_sample = 40
    temperature          = 0
    stream               = $true
    prompt               = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
} | ConvertTo-Json

# Kick off one pipeline per thread, collecting each pipeline together with
# its async invocation handle so we can join on them below.
$runspaces = foreach ($i in 1..$NumThreads) {
    Write-Host "Starting thread $i"
    $pipe = [powershell]::Create()
    [void]$pipe.AddScript({
        param($i, $headers, $payload)
        $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
        Write-Host "Response from server: $($response.StatusCode)"
    }).AddArgument($i).AddArgument($headers).AddArgument($payload)
    $pipe.RunspacePool = $runspacePool
    [PSCustomObject]@{ Pipe = $pipe; Status = $pipe.BeginInvoke() }
}

# Block until every request has completed, then release each pipeline.
foreach ($job in $runspaces) {
    $job.Pipe.EndInvoke($job.Status)
    $job.Pipe.Dispose()
}

$runspacePool.Close()
$runspacePool.Dispose()
+19 -1
View File
@@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
case "aws": case "aws":
handleAwsRateLimitError(req, errorPayload); handleAwsRateLimitError(req, errorPayload);
break; break;
case "google-palm":
case "azure": case "azure":
handleAzureRateLimitError(req, errorPayload);
break;
case "google-palm":
errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`; errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
break; break;
default: default:
@@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
return errorPayload; return errorPayload;
} }
/**
 * Handles a rate-limit error payload returned by Azure OpenAI. A "429" error
 * code marks the key as rate limited and re-enqueues the request for an
 * automatic retry; any other code is annotated on the payload and passed
 * through to the client.
 * @throws RetryableError when the request has been re-enqueued for retry.
 */
function handleAzureRateLimitError(
  req: Request,
  errorPayload: ProxiedErrorPayload
) {
  const code = errorPayload.error?.code;
  if (code === "429") {
    keyPool.markRateLimited(req.key!);
    reenqueueRequest(req);
    throw new RetryableError("Rate-limited request re-enqueued.");
  }
  // NOTE(review): Azure may emit other throttling codes; report so they can
  // be added here.
  errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${code}). Please report this.`;
}
const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => { const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) { if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
const model = req.body.model; const model = req.body.model;
+11 -5
View File
@@ -15,6 +15,8 @@ import crypto from "crypto";
import type { Handler, Request } from "express"; import type { Handler, Request } from "express";
import { keyPool } from "../shared/key-management"; import { keyPool } from "../shared/key-management";
import { import {
getAwsBedrockModelFamily,
getAzureOpenAIModelFamily,
getClaudeModelFamily, getClaudeModelFamily,
getGooglePalmModelFamily, getGooglePalmModelFamily,
getOpenAIModelFamily, getOpenAIModelFamily,
@@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
// they should be treated as separate queues. // they should be treated as separate queues.
const model = req.body.model ?? "gpt-3.5-turbo"; const model = req.body.model ?? "gpt-3.5-turbo";
// Weird special case for AWS because they serve multiple models from // Weird special case for AWS/Azure because they serve multiple models from
// different vendors, even if currently only one is supported. // different vendors, even if currently only one is supported.
if (req.service === "aws") { if (req.service === "aws") return getAwsBedrockModelFamily(model);
return "aws-claude"; if (req.service === "azure") return getAzureOpenAIModelFamily(model);
}
switch (req.outboundApi) { switch (req.outboundApi) {
case "anthropic": case "anthropic":
@@ -221,7 +222,11 @@ function processQueue() {
reqs.filter(Boolean).forEach((req) => { reqs.filter(Boolean).forEach((req) => {
if (req?.proceed) { if (req?.proceed) {
req.log.info({ retries: req.retryCount }, `Dequeuing request.`); const modelFamily = getPartitionForRequest(req!);
req.log.info({
retries: req.retryCount,
partition: modelFamily,
}, `Dequeuing request.`);
req.proceed(); req.proceed();
} }
}); });
@@ -415,6 +420,7 @@ function initStreaming(req: Request) {
// Some clients have a broken SSE parser that doesn't handle comments // Some clients have a broken SSE parser that doesn't handle comments
// correctly. These clients can pass ?badSseParser=true to // correctly. These clients can pass ?badSseParser=true to
// disable comments in the SSE stream. // disable comments in the SSE stream.
res.write(getHeartbeatPayload());
return; return;
} }
+5 -2
View File
@@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
// TODO: all of this shit is duplicate code // TODO: all of this shit is duplicate code
public getLockoutPeriod() { public getLockoutPeriod(family: AzureOpenAIModelFamily) {
const activeKeys = this.keys.filter((k) => !k.isDisabled); const activeKeys = this.keys.filter(
(key) => !key.isDisabled && key.modelFamilies.includes(family)
);
// Don't lock out if there are no keys available or the queue will stall. // Don't lock out if there are no keys available or the queue will stall.
// Just let it through so the add-key middleware can throw an error. // Just let it through so the add-key middleware can throw an error.
if (activeKeys.length === 0) return 0; if (activeKeys.length === 0) return 0;
+3 -6
View File
@@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
(key) => !key.isDisabled && key.modelFamilies.includes(family) (key) => !key.isDisabled && key.modelFamilies.includes(family)
); );
if (activeKeys.length === 0) { // Don't lock out if there are no keys available or the queue will stall.
// If there are no active keys for this model we can't fulfill requests. // Just let it through so the add-key middleware can throw an error.
// We'll return 0 to let the request through and return an error, if (activeKeys.length === 0) return 0;
// otherwise the request will be stuck in the queue forever.
return 0;
}
// A key is rate-limited if its `rateLimitedAt` plus the greater of its // A key is rate-limited if its `rateLimitedAt` plus the greater of its
// `rateLimitRequestsReset` and `rateLimitTokensReset` is after the // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the