adds Claude 3 Vision support

2024-03-05 18:34:10 -06:00
parent ea3aae5da6
commit ddf34685df
5 changed files with 102 additions and 41 deletions
@@ -1,6 +1,10 @@
 import { getTokenizer } from "@anthropic-ai/tokenizer";
 import { Tiktoken } from "tiktoken/lite";
 import { AnthropicChatMessage } from "../api-schemas";
+import { libSharp } from "../file-storage";
+import { logger } from "../../logger";
+
+const log = logger.child({ module: "tokenizer", service: "anthropic" });

 let encoder: Tiktoken;
 let userRoleCount = 0;
@@ -15,7 +19,7 @@ export function init() {
  return true;
 }

-export function getTokenCount(prompt: string | AnthropicChatMessage[]) {
+export async function getTokenCount(prompt: string | AnthropicChatMessage[]) {
  if (typeof prompt !== "string") {
    return getTokenCountForMessages(prompt);
  }
@@ -30,7 +34,7 @@ export function getTokenCount(prompt: string | AnthropicChatMessage[]) {
  };
 }

-function getTokenCountForMessages(messages: AnthropicChatMessage[]) {
+async function getTokenCountForMessages(messages: AnthropicChatMessage[]) {
  let numTokens = 0;

  for (const message of messages) {
@@ -39,20 +43,23 @@ function getTokenCountForMessages(messages: AnthropicChatMessage[]) {

    const parts = Array.isArray(content)
      ? content
-      : [{ type: "text", text: content }];
+      : [{ type: "text" as const, text: content }];

    for (const part of parts) {
-      // We don't allow other content types for now because we can't estimate
-      // cost for them.
-      if (part.type !== "text") {
-        throw new Error(`Unsupported Anthropic content type: ${part.type}`);
+      switch (part.type) {
+        case "text":
+          const { text } = part;
+          if (text.length > 800000 || numTokens > 200000) {
+            throw new Error("Text content is too large to tokenize.");
+          }
+          numTokens += encoder.encode(text.normalize("NFKC"), "all").length;
+          break;
+        case "image":
+          numTokens += await getImageTokenCount(part.source.data);
+          break;
+        default:
+          throw new Error(`Unsupported Anthropic content type.`);
      }
-
-      if (part.text.length > 800000 || numTokens > 200000) {
-        throw new Error("Content is too large to tokenize.");
-      }
-
-      numTokens += encoder.encode(part.text.normalize("NFKC"), "all").length;
    }
  }

@@ -62,3 +69,48 @@ function getTokenCountForMessages(messages: AnthropicChatMessage[]) {

  return { tokenizer: "@anthropic-ai/tokenizer", token_count: numTokens };
 }
+
+/**
+ * Estimates the number of tokens a base64-encoded image adds to a Claude
+ * Vision prompt.
+ *
+ * Per https://docs.anthropic.com/claude/docs/vision: if an image's long edge
+ * is more than 1568 pixels, or the image is more than ~1600 tokens, it is
+ * first scaled down, preserving aspect ratio, until it is within size limits.
+ * An image that needs no resizing costs roughly
+ *   tokens = (width px * height px) / 750
+ *
+ * @param b64 Base64-encoded image data.
+ * @returns The estimated token cost, rounded up to a whole token.
+ * @throws Error if the image's width or height cannot be read.
+ */
+async function getImageTokenCount(b64: string) {
+  const buffer = Buffer.from(b64, "base64");
+  const image = libSharp(buffer);
+  const metadata = await image.metadata();
+
+  if (!metadata || !metadata.width || !metadata.height) {
+    throw new Error("Prompt includes an image that could not be parsed");
+  }
+
+  const MAX_TOKENS = 1600;
+  const MAX_LENGTH_PX = 1568;
+  const PIXELS_PER_TOKEN = 750;
+  const { width, height } = metadata;
+
+  // Scale factor that brings the pixel area down to the token budget.
+  const areaFactor = Math.sqrt(
+    (PIXELS_PER_TOKEN * MAX_TOKENS) / (width * height)
+  );
+  // Scale factor that brings the longest edge down to the pixel limit.
+  const edgeFactor = MAX_LENGTH_PX / Math.max(width, height);
+
+  // Both limits must hold after the resize, so the more restrictive factor
+  // wins. Applying only the area factor leaves very wide or very tall images
+  // over the edge limit and overestimates their cost. The cap at 1 means an
+  // image already within limits is never scaled up.
+  const factor = Math.min(1, areaFactor, edgeFactor);
+
+  const scaledWidth = width * factor;
+  const scaledHeight = height * factor;
+  const tokens = (scaledWidth * scaledHeight) / PIXELS_PER_TOKEN;
+
+  log.debug({ width, height, tokens }, "Calculated Claude Vision token cost");
+  return Math.ceil(tokens);
+}
@@ -99,13 +99,9 @@ export async function countTokens({
  const time = process.hrtime();
  switch (service) {
    case "anthropic-chat":
-      return {
-        ...getClaudeTokenCount(prompt ?? completion),
-        tokenization_duration_ms: getElapsedMs(time),
-      };
    case "anthropic-text":
      return {
-        ...getClaudeTokenCount(prompt ?? completion),
+        ...(await getClaudeTokenCount(prompt ?? completion)),
        tokenization_duration_ms: getElapsedMs(time),
      };
    case "openai":