mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
Measure latency across providers.
This commit is contained in:
@@ -468,7 +468,9 @@ export namespace AixWire_Particles {
|
||||
chatIn?: number,
|
||||
chatOut?: number,
|
||||
chatOutRate?: number,
|
||||
chatTimeStart?: number,
|
||||
chatTimeInner?: number,
|
||||
chatTimeTotal?: number,
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -33,13 +33,19 @@ import { AnthropicWire_API_Message_Create } from '../../wiretypes/anthropic.wire
|
||||
* - Begin/End are explicit
|
||||
*/
|
||||
export function createAnthropicMessageParser(): ChatGenerateParseFunction {
|
||||
const parserCreationTimestamp = Date.now();
|
||||
let responseMessage: AnthropicWire_API_Message_Create.Response;
|
||||
let hasErrored = false;
|
||||
let timeToFirstEvent: number;
|
||||
let messageStartTime: number | undefined = undefined;
|
||||
let chatInTokens: number | undefined = undefined;
|
||||
|
||||
return function(pt: IParticleTransmitter, eventData: string, eventName?: string): void {
|
||||
|
||||
// Time to first event
|
||||
if (timeToFirstEvent === undefined)
|
||||
timeToFirstEvent = Date.now() - parserCreationTimestamp;
|
||||
|
||||
// if we've errored, we should not be receiving more data
|
||||
if (hasErrored)
|
||||
console.log('Anthropic stream has errored already, but received more data:', eventData);
|
||||
@@ -72,7 +78,11 @@ export function createAnthropicMessageParser(): ChatGenerateParseFunction {
|
||||
pt.setModelName(responseMessage.model);
|
||||
if (responseMessage.usage) {
|
||||
chatInTokens = responseMessage.usage.input_tokens;
|
||||
pt.setCounters({ chatIn: chatInTokens, chatOut: responseMessage.usage.output_tokens });
|
||||
pt.setCounters({
|
||||
chatIn: chatInTokens,
|
||||
chatOut: responseMessage.usage.output_tokens,
|
||||
chatTimeStart: timeToFirstEvent,
|
||||
});
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -164,7 +174,9 @@ export function createAnthropicMessageParser(): ChatGenerateParseFunction {
|
||||
chatIn: chatInTokens !== undefined ? chatInTokens : -1,
|
||||
chatOut: usage.output_tokens,
|
||||
chatOutRate: Math.round(chatOutRate * 100) / 100, // Round to 2 decimal places
|
||||
chatTimeStart: timeToFirstEvent,
|
||||
chatTimeInner: elapsedTimeSeconds,
|
||||
chatTimeTotal: Date.now() - parserCreationTimestamp,
|
||||
});
|
||||
}
|
||||
} else
|
||||
@@ -238,7 +250,9 @@ export function createAnthropicMessageParserNS(): ChatGenerateParseFunction {
|
||||
chatIn: usage.input_tokens,
|
||||
chatOut: usage.output_tokens,
|
||||
chatOutRate: Math.round(chatOutRate * 100) / 100, // Round to 2 decimal places
|
||||
chatTimeInner: elapsedTimeSeconds,
|
||||
// chatTimeStart: // meaningless for non-streaming
|
||||
// chatTimeInner: // we don't know
|
||||
chatTimeTotal: elapsedTimeSeconds,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -22,12 +22,19 @@ import { GeminiWire_API_Generate_Content, GeminiWire_Safety } from '../../wirety
|
||||
* Note that non-streaming calls will contain a complete sequence of complete parts.
|
||||
*/
|
||||
export function createGeminiGenerateContentResponseParser(modelId: string): ChatGenerateParseFunction {
|
||||
const parserCreationTimestamp = Date.now();
|
||||
const modelName = modelId.replace('models/', '');
|
||||
let hasBegun = false;
|
||||
let timeToFirstEvent: number;
|
||||
let skipSendingTotalTimeOnce = true;
|
||||
|
||||
// this can throw, it's caught by the caller
|
||||
return function(pt: IParticleTransmitter, eventData: string): void {
|
||||
|
||||
// Time to first event
|
||||
if (timeToFirstEvent === undefined)
|
||||
timeToFirstEvent = Date.now() - parserCreationTimestamp;
|
||||
|
||||
// -> Model
|
||||
if (!hasBegun) {
|
||||
hasBegun = true;
|
||||
@@ -125,11 +132,16 @@ export function createGeminiGenerateContentResponseParser(modelId: string): Chat
|
||||
}
|
||||
|
||||
// -> Stats
|
||||
if (generationChunk.usageMetadata)
|
||||
if (generationChunk.usageMetadata) {
|
||||
pt.setCounters({
|
||||
chatIn: generationChunk.usageMetadata.promptTokenCount,
|
||||
chatOut: generationChunk.usageMetadata.candidatesTokenCount,
|
||||
chatTimeStart: timeToFirstEvent,
|
||||
// chatTimeInner: // not reported
|
||||
...skipSendingTotalTimeOnce ? {} : { chatTimeTotal: Date.now() - parserCreationTimestamp }, // the second...end-1 packets will be held
|
||||
});
|
||||
skipSendingTotalTimeOnce = false;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
@@ -30,8 +30,10 @@ import { OpenAIWire_API_Chat_Completions } from '../../wiretypes/openai.wiretype
|
||||
* - There's no explicit end in this data protocol, but it's handled in the caller with a sse:[DONE] event.
|
||||
*/
|
||||
export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunction {
|
||||
const parserCreationTimestamp = Date.now();
|
||||
let hasBegun = false;
|
||||
let hasWarned = false;
|
||||
let timeToFirstEvent: number | undefined;
|
||||
// NOTE: could compute rate (tok/s) from the first textful event to the last (to ignore the prefill time)
|
||||
|
||||
// Supporting structure to accumulate the assistant message
|
||||
@@ -52,6 +54,10 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
|
||||
|
||||
return function(pt: IParticleTransmitter, eventData: string) {
|
||||
|
||||
// Time to first event
|
||||
if (timeToFirstEvent === undefined)
|
||||
timeToFirstEvent = Date.now() - parserCreationTimestamp;
|
||||
|
||||
// Throws on malformed event data
|
||||
// ```Can you extend the Zod chunk response object parsing (all optional) to include the missing data? The following is an example of the object I received:```
|
||||
const chunkData = JSON.parse(eventData); // this is here just for ease of breakpoint, otherwise it could be inlined
|
||||
@@ -85,6 +91,9 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
|
||||
pt.setCounters({
|
||||
chatIn: json.usage.prompt_tokens || -1,
|
||||
chatOut: json.usage.completion_tokens,
|
||||
...timeToFirstEvent !== undefined ? { chatTimeStart: timeToFirstEvent } : {},
|
||||
// chatTimeInner: openAI is not reporting the time as seen by the servers
|
||||
chatTimeTotal: Date.now() - parserCreationTimestamp,
|
||||
});
|
||||
|
||||
// [OpenAI] Expected correct case: the last object has usage, but an empty choices array
|
||||
@@ -92,13 +101,18 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
|
||||
return;
|
||||
}
|
||||
// [Groq] -> Stats
|
||||
// Note: if still in queue, reset the event stats, until we're out of the queue
|
||||
if (json.x_groq?.queue_length)
|
||||
timeToFirstEvent = undefined;
|
||||
if (json.x_groq?.usage) {
|
||||
const { prompt_tokens, completion_tokens, completion_time } = json.x_groq.usage;
|
||||
pt.setCounters({
|
||||
chatIn: prompt_tokens,
|
||||
chatOut: completion_tokens,
|
||||
chatOutRate: (completion_tokens && completion_time) ? Math.round((completion_tokens / completion_time) * 100) / 100 : undefined,
|
||||
chatTimeInner: completion_time,
|
||||
...timeToFirstEvent !== undefined ? { chatTimeStart: timeToFirstEvent } : {},
|
||||
chatTimeInner: Math.round((completion_time || 0) * 1000),
|
||||
chatTimeTotal: Date.now() - parserCreationTimestamp,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -184,6 +198,7 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
|
||||
/// OpenAI non-streaming ChatCompletions
|
||||
|
||||
export function createOpenAIChatCompletionsParserNS(): ChatGenerateParseFunction {
|
||||
const parserCreationTimestamp = Date.now();
|
||||
|
||||
return function(pt: IParticleTransmitter, eventData: string) {
|
||||
|
||||
@@ -215,6 +230,9 @@ export function createOpenAIChatCompletionsParserNS(): ChatGenerateParseFunction
|
||||
pt.setCounters({
|
||||
chatIn: json.usage.prompt_tokens,
|
||||
chatOut: json.usage.completion_tokens,
|
||||
// chatTimeStart: ... // not meaningful for non-streaming
|
||||
// chatTimeInner: ... // not measured/reported by OpenAI
|
||||
chatTimeTotal: Date.now() - parserCreationTimestamp,
|
||||
});
|
||||
|
||||
// Assumption/validate: expect 1 completion, or stop
|
||||
|
||||
@@ -403,6 +403,7 @@ export namespace OpenAIWire_API_Chat_Completions {
|
||||
total_tokens: z.number().optional(),
|
||||
total_time: z.number().optional(),
|
||||
}).optional(),
|
||||
queue_length: z.number().optional(),
|
||||
}).optional(),
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user