adds azure rate limit auto-retry
This commit is contained in:
@@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
|
|||||||
```
|
```
|
||||||
|
|
||||||
## Model assignment
|
## Model assignment
|
||||||
Note that each Azure deployment is assigned a model when you create it in the Microsoft Cognitive Services portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
|
Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
|
||||||
|
|
||||||
### Supported model IDs
|
### Supported model IDs
|
||||||
Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID.
|
Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID.
|
||||||
|
|
||||||
As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
|
As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
|
||||||
|
|
||||||
|
## On content filtering
|
||||||
|
Be aware that all Azure OpenAI Service deployments have content filtering enabled by default at a Medium level. Prompts or responses which are deemed to be inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service and not the proxy.
|
||||||
|
|
||||||
|
You can disable this from the deployment's settings within Azure, but you would need to request an exemption from Microsoft for your organization first. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.
|
||||||
|
|||||||
@@ -0,0 +1,44 @@
|
|||||||
|
// Manual load-test helper: sends a burst of simultaneous chat-completion
// requests to the local proxy's Azure OpenAI endpoint and logs each outcome.
const axios = require("axios");

// Number of requests started back-to-back without waiting for each other.
const concurrentRequests = 5;

// Headers sent with every request.
const headers = {
  Authorization: "Bearer test",
  "Content-Type": "application/json",
};

// Minimal non-streaming chat completion body (a single output token).
const payload = {
  model: "gpt-4",
  max_tokens: 1,
  stream: false,
  messages: [{ role: "user", content: "Hi" }],
};

/**
 * Sends one POST to the proxy and logs the result.
 * Failures are logged rather than rethrown, so the returned promise
 * always resolves.
 *
 * @param i sequence number used only to label the log lines
 */
async function makeRequest(i) {
  const url = "http://localhost:7860/proxy/azure/openai/v1/chat/completions";
  try {
    const { status, data } = await axios.post(url, payload, { headers });
    console.log(
      `Req ${i} finished with status code ${status} and response:`,
      data
    );
  } catch (error) {
    console.error(`Error in req ${i}:`, error.message);
  }
}

/**
 * Starts `concurrentRequests` requests (numbered from 1) without awaiting
 * between them, then logs once every one of them has settled.
 */
function executeRequestsConcurrently() {
  const inFlight = Array.from({ length: concurrentRequests }, (_, index) => {
    const requestNumber = index + 1;
    console.log(`Starting request ${requestNumber}`);
    return makeRequest(requestNumber);
  });

  Promise.all(inFlight).then(() => {
    console.log("All requests finished");
  });
}

executeRequestsConcurrently();
|
||||||
@@ -1,40 +0,0 @@
|
|||||||
# Manual load-test helper: posts $NumThreads completion requests to the local
# proxy's AWS Claude endpoint, running them through a runspace pool.
$NumThreads = 10

# Pool that runs between 1 and $NumThreads runspaces at a time.
$runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
$runspacePool.Open()
$runspaces = @()

# Headers sent with every request.
$headers = @{
  "Authorization"     = "Bearer test"
  "anthropic-version" = "2023-01-01"
  "Content-Type"      = "application/json"
}

# Request body, serialized to JSON once and shared by all workers.
$body = @{
  model                = "claude-v2"
  max_tokens_to_sample = 40
  temperature          = 0
  stream               = $true
  prompt               = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
}
$payload = $body | ConvertTo-Json

# Work performed inside each runspace: send one POST and print the status code.
$worker = {
  param($i, $headers, $payload)
  $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
  Write-Host "Response from server: $($response.StatusCode)"
}

# Queue one asynchronous invocation per thread number.
foreach ($i in 1..$NumThreads) {
  Write-Host "Starting thread $i"
  $runspace = [powershell]::Create()
  $runspace.AddScript($worker).AddArgument($i).AddArgument($headers).AddArgument($payload)

  $runspace.RunspacePool = $runspacePool
  $runspaces += [PSCustomObject]@{ Pipe = $runspace; Status = $runspace.BeginInvoke() }
}

# Wait for each invocation to finish, then release its pipeline.
foreach ($entry in $runspaces) {
  $entry.Pipe.EndInvoke($entry.Status)
  $entry.Pipe.Dispose()
}

$runspacePool.Close()
$runspacePool.Dispose()
|
|
||||||
@@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
|
|||||||
case "aws":
|
case "aws":
|
||||||
handleAwsRateLimitError(req, errorPayload);
|
handleAwsRateLimitError(req, errorPayload);
|
||||||
break;
|
break;
|
||||||
case "google-palm":
|
|
||||||
case "azure":
|
case "azure":
|
||||||
|
handleAzureRateLimitError(req, errorPayload);
|
||||||
|
break;
|
||||||
|
case "google-palm":
|
||||||
errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
|
errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
|
|||||||
return errorPayload;
|
return errorPayload;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Handles a rate limit error payload returned for an Azure request.
 *
 * When the upstream error code is the string "429", the request's key is
 * marked as rate-limited in the key pool, the request is put back on the
 * queue, and a RetryableError is thrown. For any other code, a `proxy_note`
 * describing the unrecognized code is attached to the payload and the
 * function returns normally.
 *
 * @param req the proxied request whose key hit the limit
 * @param errorPayload the upstream error body; may be mutated with `proxy_note`
 * @throws RetryableError after the request has been re-enqueued
 */
function handleAzureRateLimitError(
  req: Request,
  errorPayload: ProxiedErrorPayload
) {
  const azureErrorCode = errorPayload.error?.code;

  // Anything other than a plain "429" is unexpected: annotate and bail out.
  if (azureErrorCode !== "429") {
    errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${azureErrorCode}). Please report this.`;
    return;
  }

  keyPool.markRateLimited(req.key!);
  reenqueueRequest(req);
  throw new RetryableError("Rate-limited request re-enqueued.");
}
|
||||||
|
|
||||||
const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
|
const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
|
||||||
if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
|
if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
|
||||||
const model = req.body.model;
|
const model = req.body.model;
|
||||||
|
|||||||
+11
-5
@@ -15,6 +15,8 @@ import crypto from "crypto";
|
|||||||
import type { Handler, Request } from "express";
|
import type { Handler, Request } from "express";
|
||||||
import { keyPool } from "../shared/key-management";
|
import { keyPool } from "../shared/key-management";
|
||||||
import {
|
import {
|
||||||
|
getAwsBedrockModelFamily,
|
||||||
|
getAzureOpenAIModelFamily,
|
||||||
getClaudeModelFamily,
|
getClaudeModelFamily,
|
||||||
getGooglePalmModelFamily,
|
getGooglePalmModelFamily,
|
||||||
getOpenAIModelFamily,
|
getOpenAIModelFamily,
|
||||||
@@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
|
|||||||
// they should be treated as separate queues.
|
// they should be treated as separate queues.
|
||||||
const model = req.body.model ?? "gpt-3.5-turbo";
|
const model = req.body.model ?? "gpt-3.5-turbo";
|
||||||
|
|
||||||
// Weird special case for AWS because they serve multiple models from
|
// Weird special case for AWS/Azure because they serve multiple models from
|
||||||
// different vendors, even if currently only one is supported.
|
// different vendors, even if currently only one is supported.
|
||||||
if (req.service === "aws") {
|
if (req.service === "aws") return getAwsBedrockModelFamily(model);
|
||||||
return "aws-claude";
|
if (req.service === "azure") return getAzureOpenAIModelFamily(model);
|
||||||
}
|
|
||||||
|
|
||||||
switch (req.outboundApi) {
|
switch (req.outboundApi) {
|
||||||
case "anthropic":
|
case "anthropic":
|
||||||
@@ -221,7 +222,11 @@ function processQueue() {
|
|||||||
|
|
||||||
reqs.filter(Boolean).forEach((req) => {
|
reqs.filter(Boolean).forEach((req) => {
|
||||||
if (req?.proceed) {
|
if (req?.proceed) {
|
||||||
req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
|
const modelFamily = getPartitionForRequest(req!);
|
||||||
|
req.log.info({
|
||||||
|
retries: req.retryCount,
|
||||||
|
partition: modelFamily,
|
||||||
|
}, `Dequeuing request.`);
|
||||||
req.proceed();
|
req.proceed();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -415,6 +420,7 @@ function initStreaming(req: Request) {
|
|||||||
// Some clients have a broken SSE parser that doesn't handle comments
|
// Some clients have a broken SSE parser that doesn't handle comments
|
||||||
// correctly. These clients can pass ?badSseParser=true to
|
// correctly. These clients can pass ?badSseParser=true to
|
||||||
// disable comments in the SSE stream.
|
// disable comments in the SSE stream.
|
||||||
|
res.write(getHeartbeatPayload());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
|
|||||||
|
|
||||||
// TODO: all of this shit is duplicate code
|
// TODO: all of this shit is duplicate code
|
||||||
|
|
||||||
public getLockoutPeriod() {
|
public getLockoutPeriod(family: AzureOpenAIModelFamily) {
|
||||||
const activeKeys = this.keys.filter((k) => !k.isDisabled);
|
const activeKeys = this.keys.filter(
|
||||||
|
(key) => !key.isDisabled && key.modelFamilies.includes(family)
|
||||||
|
);
|
||||||
|
|
||||||
// Don't lock out if there are no keys available or the queue will stall.
|
// Don't lock out if there are no keys available or the queue will stall.
|
||||||
// Just let it through so the add-key middleware can throw an error.
|
// Just let it through so the add-key middleware can throw an error.
|
||||||
if (activeKeys.length === 0) return 0;
|
if (activeKeys.length === 0) return 0;
|
||||||
|
|||||||
@@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
|
|||||||
(key) => !key.isDisabled && key.modelFamilies.includes(family)
|
(key) => !key.isDisabled && key.modelFamilies.includes(family)
|
||||||
);
|
);
|
||||||
|
|
||||||
if (activeKeys.length === 0) {
|
// Don't lock out if there are no keys available or the queue will stall.
|
||||||
// If there are no active keys for this model we can't fulfill requests.
|
// Just let it through so the add-key middleware can throw an error.
|
||||||
// We'll return 0 to let the request through and return an error,
|
if (activeKeys.length === 0) return 0;
|
||||||
// otherwise the request will be stuck in the queue forever.
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// A key is rate-limited if its `rateLimitedAt` plus the greater of its
|
// A key is rate-limited if its `rateLimitedAt` plus the greater of its
|
||||||
// `rateLimitRequestsReset` and `rateLimitTokensReset` is after the
|
// `rateLimitRequestsReset` and `rateLimitTokensReset` is after the
|
||||||
|
|||||||
Reference in New Issue
Block a user