Measure latency across providers.

This commit is contained in:
Enrico Ros
2024-08-02 07:30:48 -07:00
parent a7ee987e04
commit b26954f326
5 changed files with 51 additions and 4 deletions
@@ -468,7 +468,9 @@ export namespace AixWire_Particles {
chatIn?: number,
chatOut?: number,
chatOutRate?: number,
chatTimeStart?: number,
chatTimeInner?: number,
chatTimeTotal?: number,
};
@@ -33,13 +33,19 @@ import { AnthropicWire_API_Message_Create } from '../../wiretypes/anthropic.wire
* - Begin/End are explicit
*/
export function createAnthropicMessageParser(): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
let responseMessage: AnthropicWire_API_Message_Create.Response;
let hasErrored = false;
let timeToFirstEvent: number;
let messageStartTime: number | undefined = undefined;
let chatInTokens: number | undefined = undefined;
return function(pt: IParticleTransmitter, eventData: string, eventName?: string): void {
// Time to first event
if (timeToFirstEvent === undefined)
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// if we've errored, we should not be receiving more data
if (hasErrored)
console.log('Anthropic stream has errored already, but received more data:', eventData);
@@ -72,7 +78,11 @@ export function createAnthropicMessageParser(): ChatGenerateParseFunction {
pt.setModelName(responseMessage.model);
if (responseMessage.usage) {
chatInTokens = responseMessage.usage.input_tokens;
pt.setCounters({ chatIn: chatInTokens, chatOut: responseMessage.usage.output_tokens });
pt.setCounters({
chatIn: chatInTokens,
chatOut: responseMessage.usage.output_tokens,
chatTimeStart: timeToFirstEvent,
});
}
break;
@@ -164,7 +174,9 @@ export function createAnthropicMessageParser(): ChatGenerateParseFunction {
chatIn: chatInTokens !== undefined ? chatInTokens : -1,
chatOut: usage.output_tokens,
chatOutRate: Math.round(chatOutRate * 100) / 100, // Round to 2 decimal places
chatTimeStart: timeToFirstEvent,
chatTimeInner: elapsedTimeSeconds,
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
}
} else
@@ -238,7 +250,9 @@ export function createAnthropicMessageParserNS(): ChatGenerateParseFunction {
chatIn: usage.input_tokens,
chatOut: usage.output_tokens,
chatOutRate: Math.round(chatOutRate * 100) / 100, // Round to 2 decimal places
chatTimeInner: elapsedTimeSeconds,
// chatTimeStart: // meaningless non-streaming
// chatTimeInner: // we don't know
chatTimeTotal: elapsedTimeSeconds,
});
}
};
@@ -22,12 +22,19 @@ import { GeminiWire_API_Generate_Content, GeminiWire_Safety } from '../../wirety
* Note that non-streaming calls will contain a complete sequence of complete parts.
*/
export function createGeminiGenerateContentResponseParser(modelId: string): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
const modelName = modelId.replace('models/', '');
let hasBegun = false;
let timeToFirstEvent: number;
let skipSendingTotalTimeOnce = true;
// this can throw, it's caught by the caller
return function(pt: IParticleTransmitter, eventData: string): void {
// Time to first event
if (timeToFirstEvent === undefined)
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// -> Model
if (!hasBegun) {
hasBegun = true;
@@ -125,11 +132,16 @@ export function createGeminiGenerateContentResponseParser(modelId: string): Chat
}
// -> Stats
if (generationChunk.usageMetadata)
if (generationChunk.usageMetadata) {
pt.setCounters({
chatIn: generationChunk.usageMetadata.promptTokenCount,
chatOut: generationChunk.usageMetadata.candidatesTokenCount,
chatTimeStart: timeToFirstEvent,
// chatTimeInner: // not reported
...skipSendingTotalTimeOnce ? {} : { chatTimeTotal: Date.now() - parserCreationTimestamp }, // the second...end-1 packets will be held
});
skipSendingTotalTimeOnce = false;
}
};
}
@@ -30,8 +30,10 @@ import { OpenAIWire_API_Chat_Completions } from '../../wiretypes/openai.wiretype
* - There's no explicit end in this data protocol, but it's handled in the caller with a sse:[DONE] event.
*/
export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
let hasBegun = false;
let hasWarned = false;
let timeToFirstEvent: number | undefined;
// NOTE: could compute rate (tok/s) from the first textful event to the last (to ignore the prefill time)
// Supporting structure to accumulate the assistant message
@@ -52,6 +54,10 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
return function(pt: IParticleTransmitter, eventData: string) {
// Time to first event
if (timeToFirstEvent === undefined)
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// Throws on malformed event data
// ```Can you extend the Zod chunk response object parsing (all optional) to include the missing data? The following is an example of the object I received:```
const chunkData = JSON.parse(eventData); // this is here just for ease of breakpoint, otherwise it could be inlined
@@ -85,6 +91,9 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
pt.setCounters({
chatIn: json.usage.prompt_tokens || -1,
chatOut: json.usage.completion_tokens,
...timeToFirstEvent !== undefined ? { chatTimeStart: timeToFirstEvent } : {},
// chatTimeInner: openAI is not reporting the time as seen by the servers
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
// [OpenAI] Expected correct case: the last object has usage, but an empty choices array
@@ -92,13 +101,18 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
return;
}
// [Groq] -> Stats
// Note: if still in queue, reset the event stats, until we're out of the queue
if (json.x_groq?.queue_length)
timeToFirstEvent = undefined;
if (json.x_groq?.usage) {
const { prompt_tokens, completion_tokens, completion_time } = json.x_groq.usage;
pt.setCounters({
chatIn: prompt_tokens,
chatOut: completion_tokens,
chatOutRate: (completion_tokens && completion_time) ? Math.round((completion_tokens / completion_time) * 100) / 100 : undefined,
chatTimeInner: completion_time,
...timeToFirstEvent !== undefined ? { chatTimeStart: timeToFirstEvent } : {},
chatTimeInner: Math.round((completion_time || 0) * 1000),
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
}
@@ -184,6 +198,7 @@ export function createOpenAIChatCompletionsChunkParser(): ChatGenerateParseFunct
/// OpenAI non-streaming ChatCompletions
export function createOpenAIChatCompletionsParserNS(): ChatGenerateParseFunction {
const parserCreationTimestamp = Date.now();
return function(pt: IParticleTransmitter, eventData: string) {
@@ -215,6 +230,9 @@ export function createOpenAIChatCompletionsParserNS(): ChatGenerateParseFunction
pt.setCounters({
chatIn: json.usage.prompt_tokens,
chatOut: json.usage.completion_tokens,
// chatTimeStart: ... // not meaningful for non-streaming
// chatTimeInner: ... // not measured/reported by OpenAI
chatTimeTotal: Date.now() - parserCreationTimestamp,
});
// Assumption/validate: expect 1 completion, or stop
@@ -403,6 +403,7 @@ export namespace OpenAIWire_API_Chat_Completions {
total_tokens: z.number().optional(),
total_time: z.number().optional(),
}).optional(),
queue_length: z.number().optional(),
}).optional(),
});