Measure latency across providers.

This commit is contained in:
Enrico Ros
2024-08-02 07:30:48 -07:00
parent a7ee987e04
commit b26954f326
5 changed files with 51 additions and 4 deletions
@@ -468,7 +468,9 @@ export namespace AixWire_Particles {
chatIn?: number,
chatOut?: number,
chatOutRate?: number,
chatTimeStart?: number,
chatTimeInner?: number,
chatTimeTotal?: number,
};
@@ -33,13 +33,19 @@ import { AnthropicWire_API_Message_Create } from '../../wiretypes/anthropic.wire
* - Begin/End are explicit
*/
export function createAnthropicMessageParser(): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
let responseMessage: AnthropicWire_API_Message_Create.Response;
let hasErrored = false;
let timeToFirstEvent: number;
let messageStartTime: number | undefined = undefined;
let chatInTokens: number | undefined = undefined;
return function(pt: IParticleTransmitter, eventData: string, eventName?: string): void {
// Time to first event
if (timeToFirstEvent === undefined)
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// if we've errored, we should not be receiving more data
if (hasErrored)
console.log('Anthropic stream has errored already, but received more data:', eventData);
@@ -72,7 +78,11 @@ export function createAnthropicMessageParser(): ChatGenerateParseFunction {
pt.setModelName(responseMessage.model);
if (responseMessage.usage) {
chatInTokens = responseMessage.usage.input_tokens;
pt.setCounters({ chatIn: chatInTokens, chatOut: responseMessage.usage.output_tokens });
pt.setCounters({
chatIn: chatInTokens,
chatOut: responseMessage.usage.output_tokens,
chatTimeStart: timeToFirstEvent,
});
}
break;
@@ -164,7 +174,9 @@ export function createAnthropicMessageParser(): ChatGenerateParseFunction {
chatIn: chatInTokens !== undefined ? chatInTokens : -1,
chatOut: usage.output_tokens,
chatOutRate: Math.round(chatOutRate * 100) / 100, // Round to 2 decimal places
chatTimeStart: timeToFirstEvent,
chatTimeInner: elapsedTimeSeconds,
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
}
} else
@@ -238,7 +250,9 @@ export function createAnthropicMessageParserNS(): ChatGenerateParseFunction {
chatIn: usage.input_tokens,
chatOut: usage.output_tokens,
chatOutRate: Math.round(chatOutRate * 100) / 100, // Round to 2 decimal places
chatTimeInner: elapsedTimeSeconds,
// chatTimeStart: // meaningless non-streaming
// chatTimeInner: // we don't know
chatTimeTotal: elapsedTimeSeconds,
});
}
};
@@ -22,12 +22,19 @@ import { GeminiWire_API_Generate_Content, GeminiWire_Safety } from '../../wirety
* Note that non-streaming calls will contain a complete sequence of complete parts.
*/
export function createGeminiGenerateContentResponseParser(modelId: string): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
const modelName = modelId.replace('models/', '');
let hasBegun = false;
let timeToFirstEvent: number;
let skipSendingTotalTimeOnce = true;
// this can throw, it's caught by the caller
return function(pt: IParticleTransmitter, eventData: string): void {
// Time to first event
if (timeToFirstEvent === undefined)
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// -> Model
if (!hasBegun) {
hasBegun = true;
@@ -125,11 +132,16 @@ export function createGeminiGenerateContentResponseParser(modelId: string): Chat
}
// -> Stats
if (generationChunk.usageMetadata)
if (generationChunk.usageMetadata) {
pt.setCounters({
chatIn: generationChunk.usageMetadata.promptTokenCount,
chatOut: generationChunk.usageMetadata.candidatesTokenCount,
chatTimeStart: timeToFirstEvent,
// chatTimeInner: // not reported
...skipSendingTotalTimeOnce ? {} : { chatTimeTotal: Date.now() - parserCreationTimestamp }, // the second...end-1 packets will be held
});
skipSendingTotalTimeOnce = false;
}
};
}
@@ -30,8 +30,10 @@ import { OpenAIWire_API_Chat_Completions } from '../../wiretypes/openai.wiretype
* - There's no explicit end in this data protocol, but it's handled in the caller with a sse:[DONE] event.
*/
export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
let hasBegun = false;
let hasWarned = false;
let timeToFirstEvent: number | undefined;
// NOTE: could compute rate (tok/s) from the first textful event to the last (to ignore the prefill time)
// Supporting structure to accumulate the assistant message
@@ -52,6 +54,10 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
return function(pt: IParticleTransmitter, eventData: string) {
// Time to first event
if (timeToFirstEvent === undefined)
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// Throws on malformed event data
// ```Can you extend the Zod chunk response object parsing (all optional) to include the missing data? The following is an example of the object I received:```
const chunkData = JSON.parse(eventData); // this is here just for ease of breakpoint, otherwise it could be inlined
@@ -85,6 +91,9 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
pt.setCounters({
chatIn: json.usage.prompt_tokens || -1,
chatOut: json.usage.completion_tokens,
...timeToFirstEvent !== undefined ? { chatTimeStart: timeToFirstEvent } : {},
// chatTimeInner: openAI is not reporting the time as seen by the servers
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
// [OpenAI] Expected correct case: the last object has usage, but an empty choices array
@@ -92,13 +101,18 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
return;
}
// [Groq] -> Stats
// Note: if still in queue, reset the event stats, until we're out of the queue
if (json.x_groq?.queue_length)
timeToFirstEvent = undefined;
if (json.x_groq?.usage) {
const { prompt_tokens, completion_tokens, completion_time } = json.x_groq.usage;
pt.setCounters({
chatIn: prompt_tokens,
chatOut: completion_tokens,
chatOutRate: (completion_tokens && completion_time) ? Math.round((completion_tokens / completion_time) * 100) / 100 : undefined,
chatTimeInner: completion_time,
...timeToFirstEvent !== undefined ? { chatTimeStart: timeToFirstEvent } : {},
chatTimeInner: Math.round((completion_time || 0) * 1000),
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
}
@@ -184,6 +198,7 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
/// OpenAI non-streaming ChatCompletions
export function createOpenAIChatCompletionsParserNS(): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
return function(pt: IParticleTransmitter, eventData: string) {
@@ -215,6 +230,9 @@ export function createOpenAIChatCompletionsParserNS(): ChatGenerateParseFunction
pt.setCounters({
chatIn: json.usage.prompt_tokens,
chatOut: json.usage.completion_tokens,
// chatTimeStart: ... // not meaningful for non-streaming
// chatTimeInner: ... // not measured/reported by OpenAI
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
// Assumption/validate: expect 1 completion, or stop
@@ -403,6 +403,7 @@ export namespace OpenAIWire_API_Chat_Completions {
total_tokens: z.number().optional(),
total_time: z.number().optional(),
}).optional(),
queue_length: z.number().optional(),
}).optional(),
});