Gemini: support tiered cache pricing

2026-05-10 21:50:14 -07:00 · 2025-06-06 11:17:40 -07:00
parent 31bb06293d
commit 1112aa292f
1 changed files with 9 additions and 11 deletions
@@ -179,9 +179,9 @@ export function metricsComputeChatGenerateCostsMd(metrics?: Readonly<DMetricsCha
  const isPartialMessage = metrics.TsR === 'pending' || metrics.TsR === 'aborted';

  // Calculate costs
-  const tierTokens = sumInputTokens;
-  const $inNew = getLlmCostForTokens(tierTokens, inNewTokens, pricing.input);
-  const $out = getLlmCostForTokens(tierTokens, outTokens, pricing.output);
+  const inputTierTokens = sumInputTokens;
+  const $inNew = getLlmCostForTokens(inputTierTokens, inNewTokens, pricing.input);
+  const $out = getLlmCostForTokens(outTokens, outTokens, pricing.output);
  if ($inNew === undefined || $out === undefined) {
    // many llms don't have pricing information, so the cost computation ends here
    return { $code: 'partial-price' };
@@ -201,14 +201,12 @@ export function metricsComputeChatGenerateCostsMd(metrics?: Readonly<DMetricsCha
    return { $c: $noCacheRounded, $code: 'partial-price' };
  }

-  // 2024-08-22: DEV Note: we put this here to break in case we start having tiered price with cache,
-  // for which we don't know if the tier discriminator is the input tokens level, or the equivalent
-  // tokens level (input + cache)
-  if (Array.isArray(cachePricing.read) || ('write' in cachePricing && Array.isArray(cachePricing.write)))
-    throw new Error('Tiered pricing with cache is not supported');
+  // 2025-01-10: Now supporting tiered cache pricing
+  // Note: We use the total input tokens (new + cache) as the tier discriminator,
+  // as this aligns with how providers typically structure their pricing tiers

  // compute the input cache read costs
-  const $cacheRead = getLlmCostForTokens(tierTokens, inCacheReadTokens, cachePricing.read);
+  const $cacheRead = getLlmCostForTokens(inputTierTokens, inCacheReadTokens, cachePricing.read);
  if ($cacheRead === undefined) {
    console.log(`Missing cache read pricing for ${logLlmRefId}`);
    return { $c: $noCacheRounded, $code: 'partial-price' };
@@ -218,7 +216,7 @@ export function metricsComputeChatGenerateCostsMd(metrics?: Readonly<DMetricsCha
  let $cacheWrite;
  switch (cachePricing.cType) {
    case 'ant-bp':
-      $cacheWrite = getLlmCostForTokens(tierTokens, inCacheWriteTokens, cachePricing.write);
+      $cacheWrite = getLlmCostForTokens(inputTierTokens, inCacheWriteTokens, cachePricing.write);
      break;
    case 'oai-ac':
      $cacheWrite = 0;
@@ -235,7 +233,7 @@ export function metricsComputeChatGenerateCostsMd(metrics?: Readonly<DMetricsCha
  const $c = Math.round(($inNew + $cacheRead + $cacheWrite + $out) * USD_TO_CENTS * 10000) / 10000;

  // compute the advantage from caching
-  const $inAsIfNoCache = getLlmCostForTokens(tierTokens, sumInputTokens, pricing.input)!;
+  const $inAsIfNoCache = getLlmCostForTokens(inputTierTokens, sumInputTokens, pricing.input)!;
  const $cdCache = Math.round(($inAsIfNoCache - $inNew - $cacheRead - $cacheWrite) * USD_TO_CENTS * 10000) / 10000;

  // mark the costs as partial if the message was not completely received - i.e. the server did not tell us the final tokens count