mistral update

2025-04-30 20:03:40 +02:00
parent 80d09f470b
commit c1cb395020
6 changed files with 323 additions and 49 deletions
@@ -1,11 +1,17 @@
 import { RequestPreprocessor } from "../index";
 import { countTokens } from "../../../../shared/tokenization";
 import { assertNever } from "../../../../shared/utils";
+import { OpenAIChatMessage } from "../../../../shared/api-schemas";
+import { GoogleAIChatMessage } from "../../../../shared/api-schemas/google-ai";
 import {
-  GoogleAIChatMessage,
-  MistralAIChatMessage,
-  OpenAIChatMessage,
-} from "../../../../shared/api-schemas";
+  AnthropicChatMessage,
+  flattenAnthropicMessages,
+} from "../../../../shared/api-schemas/anthropic";
+import { 
+  MistralAIChatMessage, 
+  ContentItem,
+  isMistralVisionModel 
+} from "../../../../shared/api-schemas/mistral-ai";

 /**
 * Given a request with an already-transformed body, counts the number of
@@ -61,9 +67,47 @@ export const countPromptTokens: RequestPreprocessor = async (req) => {
    case "mistral-ai":
    case "mistral-text": {
      req.outputTokens = req.body.max_tokens;
-      const prompt: string | MistralAIChatMessage[] =
-        req.body.messages ?? req.body.prompt;
+      
+      // Handle multimodal content (vision) in Mistral models
+      const isVisionModel = isMistralVisionModel(req.body.model);
+      const messages = req.body.messages;
+      
+      // Check if this is a vision request with images
+      const hasImageContent = Array.isArray(messages) && messages.some(
+        (msg: MistralAIChatMessage) => Array.isArray(msg.content) && 
+          msg.content.some((item: ContentItem) => item.type === "image_url")
+      );
+      
+      // For vision content, we add a fixed token count per image
+      // This is an estimate as the actual token count depends on image size and complexity
+      const TOKENS_PER_IMAGE = 1200; // Conservative estimate
+      let imageTokens = 0;
+      
+      if (hasImageContent && Array.isArray(messages)) {
+        // Count images in the request
+        for (const msg of messages) {
+          if (Array.isArray(msg.content)) {
+            const imageCount = msg.content.filter(
+              (item: ContentItem) => item.type === "image_url"
+            ).length;
+            imageTokens += imageCount * TOKENS_PER_IMAGE;
+          }
+        }
+        
+        req.log.debug(
+          { imageCount: imageTokens / TOKENS_PER_IMAGE, tokenEstimate: imageTokens },
+          "Estimated token count for Mistral vision images"
+        );
+      }
+      
+      const prompt: string | MistralAIChatMessage[] = messages ?? req.body.prompt;
      result = await countTokens({ req, prompt, service });
+      
+      // Add the image tokens to the total count
+      if (imageTokens > 0) {
+        result.token_count += imageTokens;
+      }
+      
      break;
    }
    case "openai-image": {
@@ -4,7 +4,7 @@ import {
  API_REQUEST_TRANSFORMERS,
 } from "../../../../shared/api-schemas";
 import { BadRequestError } from "../../../../shared/errors";
-import { fixMistralPrompt } from "../../../../shared/api-schemas/mistral-ai";
+import { fixMistralPrompt, isMistralVisionModel } from "../../../../shared/api-schemas/mistral-ai";
 import {
  isImageGenerationRequest,
  isTextGenerationRequest,
@@ -117,12 +117,66 @@ function applyMistralPromptFixes(req: Request): void {
    // mistral prompt and try to fix it if it fails. It will be re-validated
    // after this function returns.
    const result = API_REQUEST_VALIDATORS["mistral-ai"].parse(req.body);
+    
+    // Check if this is a vision model request
+    const isVisionModel = isMistralVisionModel(req.body.model);
+    
+    // Check if the request contains image content
+    const hasImageContent = result.messages?.some((msg: {content: string | any[]}) => 
+      Array.isArray(msg.content) && 
+      msg.content.some((item: any) => item.type === "image_url")
+    );
+    
+    // For vision requests, normalize the image_url format
+    if (hasImageContent && Array.isArray(result.messages)) {
+      // Process each message with image content
+      result.messages.forEach((msg: any) => {
+        if (Array.isArray(msg.content)) {
+          // Process each content item
+          msg.content.forEach((item: any) => {
+            if (item.type === "image_url") {
+              // Normalize the image_url field to a string format that Mistral expects
+              if (typeof item.image_url === "object") {
+                // If it's an object, extract the URL or base64 data
+                if (item.image_url.url) {
+                  item.image_url = item.image_url.url;
+                } else if (item.image_url.data) {
+                  item.image_url = item.image_url.data;
+                }
+                
+                req.log.info(
+                  { model: req.body.model },
+                  "Normalized object-format image_url to string format"
+                );
+              }
+            }
+          });
+        }
+      });
+    }
+    
+    // Apply Mistral prompt fixes while preserving multimodal content
    req.body.messages = fixMistralPrompt(result.messages);
    req.log.info(
-      { n: req.body.messages.length, prev: result.messages.length },
+      { 
+        n: req.body.messages.length, 
+        prev: result.messages.length,
+        isVisionModel,
+        hasImageContent 
+      },
      "Applied Mistral chat prompt fixes."
    );

+    // If the request contains image content (this checks the messages only, not the
+    // model name), keep it in chat format; it is never converted to text completions
+    if (hasImageContent) {
+      req.log.info(
+        { model: req.body.model },
+        "Detected Mistral vision request with image content. Keeping as chat format."
+      );
+      return;
+    }
+
    // If the prompt relies on `prefix: true` for the last message, we need to
    // convert it to a text completions request because AWS Mistral support for
    // this feature is broken.
@@ -20,38 +20,61 @@ import { createQueuedProxyMiddleware } from "./middleware/request/proxy-middlewa
 // months of releasing them so this list is hard to keep up to date. 2024-07-28
 // https://docs.mistral.ai/platform/endpoints
 export const KNOWN_MISTRAL_AI_MODELS = [
-  /*
-  Mistral Nemo
-  "A 12B model built with the partnership with Nvidia.  It is easy to use and a
-  drop-in replacement in any system using Mistral 7B that it supersedes."
-  */
+  /* Premier models */
+  // Mistral Large (top-tier reasoning model)
+  "mistral-large-latest",
+  "mistral-large-2411", 
+  "mistral-large-2407",
+  "mistral-large-2402", // older version
+  
+  // Pixtral Large (multimodal/vision model)
+  "pixtral-large-latest",
+  "pixtral-large-2411",
+  
+  // Mistral Saba (language-specialized model)
+  "mistral-saba-latest",
+  "mistral-saba-2502",
+  
+  // Codestral (code model)
+  "codestral-latest",
+  "codestral-2501",
+  "codestral-2405",
+  
+  // Ministral models (edge models)
+  "ministral-8b-latest",
+  "ministral-8b-2410",
+  "ministral-3b-latest",
+  "ministral-3b-2410",
+  
+  // Embedding & Moderation
+  "mistral-embed",  
+  "mistral-embed-2312",
+  "mistral-moderation-latest", 
+  "mistral-moderation-2411",
+
+  /* Free models */
+  // Mistral Small (with vision in latest version)
+  "mistral-small-latest",
+  "mistral-small-2503", // v3.1 with vision
+  "mistral-small-2402", // older version
+  
+  // Pixtral 12B (vision model)
+  "pixtral-12b-latest",
+  "pixtral-12b-2409",
+  
+  /* Research & Open Models */
+  // Mistral Nemo
  "open-mistral-nemo",
  "open-mistral-nemo-2407",
-  /*
-  Mistral Large
-  "Our flagship model with state-of-the-art reasoning, knowledge, and coding
-  capabilities."
-  */
-  "mistral-large-latest",
-  "mistral-large-2407",
-  "mistral-large-2402", // deprecated
-  /*
-  Codestral
-  "A cutting-edge generative model that has been specifically designed and
-  optimized for code generation tasks, including fill-in-the-middle and code
-  completion."
-  note: this uses a separate bidi completion endpoint that is not implemented
-  */
-  "codestral-latest",
-  "codestral-2405",
-  /* So-called "Research Models" */
+  
+  // Earlier Mixtral & Mistral models
  "open-mistral-7b",
  "open-mixtral-8x7b",
-  "open-mistral-8x22b",
+  "open-mixtral-8x22b", 
  "open-codestral-mamba",
-  /* Deprecated production models */
-  "mistral-small-latest",
-  "mistral-small-2402",
+  "mathstral",
+  
+  /* Legacy/deprecated models */
  "mistral-medium-latest",
  "mistral-medium-2312",
  "mistral-tiny",
@@ -4,9 +4,59 @@ import { Template } from "@huggingface/jinja";
 import { APIFormatTransformer } from "./index";
 import { logger } from "../../logger";

+// Content item schemas for multimodal messages; a text item is { type: "text", text: string }
+export const TextContentSchema = z.object({
+  type: z.literal("text"),
+  text: z.string()
+});
+
+export const ImageUrlContentSchema = z.object({
+  type: z.literal("image_url"),
+  image_url: z.union([
+    // Any string that passes zod's URL check (e.g. https://...)
+    z.string().url(),
+    // Base64 data URI prefix for jpeg, png, gif or webp images (data:image/jpeg;base64,...)
+    z.string().regex(/^data:image\/(jpeg|png|gif|webp);base64,/),
+    // Object format: any record (might contain detail or url properties)
+    z.record(z.any()),
+    // Allow any string for maximum compatibility. NOTE(review): because this accepts every string, the two stricter string variants above add no extra validation
+    z.string()
+  ])
+});
+
+export const ContentItemSchema = z.union([TextContentSchema, ImageUrlContentSchema]);
+
+// Static TypeScript types inferred from the zod content schemas
+export type TextContent = z.infer<typeof TextContentSchema>;
+export type ImageUrlContent = z.infer<typeof ImageUrlContentSchema>;
+export type ContentItem = z.infer<typeof ContentItemSchema>;
+
+// Mistral model IDs treated as vision-capable; isMistralVisionModel matches these exactly or as a prefix followed by "-"
+export const MISTRAL_VISION_MODELS = [
+  "pixtral-12b-2409",
+  "pixtral-12b-latest",
+  "pixtral-large-2411",
+  "pixtral-large-latest",
+  "mistral-small-2503",
+  "mistral-small-latest"
+];
+
+// Returns true when `model` is exactly one of MISTRAL_VISION_MODELS or starts with one of them followed by "-"
+export function isMistralVisionModel(model: string): boolean {
+  return MISTRAL_VISION_MODELS.some(visionModel => 
+    model === visionModel || 
+    model.startsWith(`${visionModel}-`)
+  );
+}
+
+// Schema for one Mistral chat message: role, content, and an optional prefix flag
 const MistralChatMessageSchema = z.object({
   role: z.enum(["system", "user", "assistant", "tool"]), // TODO: implement tools
-  content: z.string(),
+  // Content is either a plain string (backwards compatible) or an array of content items (multimodal)
+  content: z.union([
+    z.string(),
+    z.array(ContentItemSchema)
+  ]),
   prefix: z.boolean().optional(),
 });

@@ -107,7 +157,26 @@ export function fixMistralPrompt(
    // Consolidate multiple messages from the same role
    const last = acc[acc.length - 1];
    if (last.role === copy.role) {
-      last.content += "\n\n" + copy.content;
+      // Handle different content types for consolidation
+      if (typeof last.content === "string" && typeof copy.content === "string") {
+        // Both are strings, concatenate them
+        last.content += "\n\n" + copy.content;
+      } else if (Array.isArray(last.content) && typeof copy.content === "string") {
+        // Add the string content as a new text content item
+        last.content.push({
+          type: "text",
+          text: copy.content
+        });
+      } else if (typeof last.content === "string" && Array.isArray(copy.content)) {
+        // Convert last.content to array and append copy.content items
+        last.content = [
+          { type: "text", text: last.content },
+          ...copy.content
+        ];
+      } else if (Array.isArray(last.content) && Array.isArray(copy.content)) {
+        // Both are arrays, concatenate them
+        last.content = [...last.content, ...copy.content];
+      }
    } else {
      acc.push(copy);
    }
@@ -125,18 +194,41 @@ export function fixMistralPrompt(

 let jinjaTemplate: Template;
 let renderTemplate: (messages: MistralAIChatMessage[]) => string;
+
+// Helper that flattens message content to a string: a string is returned as-is, an array is reduced to its text items, anything else becomes ""
+function contentToString(content: string | any[]): string {
+  if (typeof content === "string") {
+    return content;
+  } else if (Array.isArray(content)) {
+    // For multimodal content, keep only the "text" items and join their text with a blank line
+    // Image items are dropped; they are not supported in text-only templates
+    return content
+      .filter(item => item.type === "text")
+      .map(item => (item as any).text)
+      .join("\n\n");
+  }
+  return "";
+}
+
 function renderMistralPrompt(messages: MistralAIChatMessage[]) {
  if (!jinjaTemplate) {
    logger.warn("Lazy loading mistral chat template...");
    const { chatTemplate, bosToken, eosToken } =
      require("./templates/mistral-template").MISTRAL_TEMPLATE;
    jinjaTemplate = new Template(chatTemplate);
-    renderTemplate = (messages) =>
-      jinjaTemplate.render({
-        messages,
+    renderTemplate = (messages) => {
+      // We need to convert any multimodal content to string format for the template
+      const textOnlyMessages = messages.map(msg => ({
+        ...msg,
+        content: contentToString(msg.content)
+      }));
+      
+      return jinjaTemplate.render({
+        messages: textOnlyMessages,
        bos_token: bosToken,
        eos_token: eosToken,
      });
+    };
  }

  return renderTemplate(messages);
@@ -145,6 +237,9 @@ function renderMistralPrompt(messages: MistralAIChatMessage[]) {
 /**
 * Attempts to convert a Mistral chat completions request to a text completions,
 * using the official prompt template published by Mistral.
+ * 
+ * Note: This transformation is only applicable for text-only models.
+ * Multimodal/vision models (Pixtral, etc.) cannot use this transformation.
 */
 export const transformMistralChatToText: APIFormatTransformer<
  typeof MistralAIV1TextCompletionsSchema
@@ -159,8 +254,24 @@ export const transformMistralChatToText: APIFormatTransformer<
    throw result.error;
  }

-  const { messages, ...rest } = result.data;
-  const prompt = renderMistralPrompt(messages);
+  // Check if this is a vision request (contains any image_url content items)
+  const { messages, model, ...rest } = result.data;
+  const hasVisionContent = messages.some(msg => 
+    Array.isArray(msg.content) && 
+    msg.content.some(item => item.type === "image_url")
+  );

-  return { ...rest, prompt, messages: undefined };
+  // Cannot transform vision requests to text completions
+  if (hasVisionContent) {
+    req.log.warn(
+      { model },
+      "Cannot transform Mistral vision request to text completions format"
+    );
+    throw new Error(
+      "Vision requests (with image_url content) cannot be transformed to text completions format"
+    );
+  }
+
+  const prompt = renderMistralPrompt(messages);
+  return { ...rest, model, prompt, messages: undefined };
 };
@@ -248,22 +248,56 @@ export function getGoogleAIModelFamily(model: string): GoogleAIModelFamily {
 }

 export function getMistralAIModelFamily(model: string): MistralAIModelFamily {
-  const prunedModel = model.replace(/-(latest|\d{4})$/, "");
+  const prunedModel = model.replace(/-(latest|\d{4}(-\d{2}){0,2})$/, "");
+  
+  // Map the pruned model name onto one of the mistral-* model families
  switch (prunedModel) {
+    // Existing direct matches
    case "mistral-tiny":
    case "mistral-small":
    case "mistral-medium":
    case "mistral-large":
      return prunedModel as MistralAIModelFamily;
+      
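+    // Premier models - Large tier (NOTE: "mistral-large" is already returned by the direct matches above, so only "pixtral-large" is reachable here)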
+    case "mistral-large":
+    case "pixtral-large":
+      return "mistral-large";
+      
+    // Premier models - Medium tier
+    case "mistral-saba":
+      return "mistral-medium";
+      
+    // Premier models - Small tier
+    case "codestral":
+    case "ministral-8b":
+    case "mistral-embed":
+    case "mistral-moderation":
+      return "mistral-small";
+    
+    // Premier models - Tiny tier
+    case "ministral-3b":
+      return "mistral-tiny";
+      
+    // Free models - Tiny tier
    case "open-mistral-7b":
      return "mistral-tiny";
+      
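+    // Free models - Small tier (NOTE: "mistral-small" is already returned by the direct matches above, so that label is unreachable here)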
+    case "mistral-small":
+    case "pixtral":
+    case "pixtral-12b":
    case "open-mistral-nemo":
    case "open-mixtral-8x7b":
-    case "codestral":
    case "open-codestral-mamba":
+    case "mathstral":
      return "mistral-small";
+    
+    // Free models - Medium tier
    case "open-mixtral-8x22b":
      return "mistral-medium";
+      
+    // Default to small if unknown
    default:
      return "mistral-small";
  }
@@ -105,19 +105,27 @@ export function getTokenCostUsd(model: ModelFamily, tokens: number) {
      break;
    case "aws-mistral-tiny":
    case "mistral-tiny":
-      cost = 0.0000003;
+      // Using Ministral 3B pricing: $0.04/1M input tokens, $0.04/1M output tokens
+      // For edge/tiny models, a more balanced 1:1 ratio is used
+      cost = 0.00000004;
      break;
    case "aws-mistral-small":
    case "mistral-small":
-      cost = 0.00000035;
+      // Using Codestral pricing: $0.3/1M input, $0.9/1M output (highest in category)
+      // Weighted average for 1:3 input:output ratio
+      cost = 0.00000075;
      break;
    case "aws-mistral-medium":
    case "mistral-medium":
-      cost = 0.000004;
+      // Using Mistral Saba pricing: $0.2/1M input, $0.6/1M output
+      // Weighted average for 1:3 input:output ratio
+      cost = 0.0000005;
      break;
    case "aws-mistral-large":
    case "mistral-large":
-      cost = 0.000012;
+      // Using Mistral Large/Pixtral Large pricing: $2/1M input, $6/1M output
+      // Weighted average for 1:3 input:output ratio
+      cost = 0.000005;
      break;
    case "gemini-flash":
      cost = 0.0000002326;