diff --git a/src/proxy/middleware/request/preprocessors/count-prompt-tokens.ts b/src/proxy/middleware/request/preprocessors/count-prompt-tokens.ts index eadd1e4..79f841b 100644 --- a/src/proxy/middleware/request/preprocessors/count-prompt-tokens.ts +++ b/src/proxy/middleware/request/preprocessors/count-prompt-tokens.ts @@ -1,11 +1,17 @@ import { RequestPreprocessor } from "../index"; import { countTokens } from "../../../../shared/tokenization"; import { assertNever } from "../../../../shared/utils"; +import { OpenAIChatMessage } from "../../../../shared/api-schemas"; +import { GoogleAIChatMessage } from "../../../../shared/api-schemas/google-ai"; import { - GoogleAIChatMessage, - MistralAIChatMessage, - OpenAIChatMessage, -} from "../../../../shared/api-schemas"; + AnthropicChatMessage, + flattenAnthropicMessages, +} from "../../../../shared/api-schemas/anthropic"; +import { + MistralAIChatMessage, + ContentItem, + isMistralVisionModel +} from "../../../../shared/api-schemas/mistral-ai"; /** * Given a request with an already-transformed body, counts the number of @@ -61,9 +67,47 @@ export const countPromptTokens: RequestPreprocessor = async (req) => { case "mistral-ai": case "mistral-text": { req.outputTokens = req.body.max_tokens; - const prompt: string | MistralAIChatMessage[] = - req.body.messages ?? 
req.body.prompt; + + // Handle multimodal content (vision) in Mistral models + const isVisionModel = isMistralVisionModel(req.body.model); + const messages = req.body.messages; + + // Check if this is a vision request with images + const hasImageContent = Array.isArray(messages) && messages.some( + (msg: MistralAIChatMessage) => Array.isArray(msg.content) && + msg.content.some((item: ContentItem) => item.type === "image_url") + ); + + // For vision content, we add a fixed token count per image + // This is an estimate as the actual token count depends on image size and complexity + const TOKENS_PER_IMAGE = 1200; // Conservative estimate + let imageTokens = 0; + + if (hasImageContent && Array.isArray(messages)) { + // Count images in the request + for (const msg of messages) { + if (Array.isArray(msg.content)) { + const imageCount = msg.content.filter( + (item: ContentItem) => item.type === "image_url" + ).length; + imageTokens += imageCount * TOKENS_PER_IMAGE; + } + } + + req.log.debug( + { imageCount: imageTokens / TOKENS_PER_IMAGE, tokenEstimate: imageTokens }, + "Estimated token count for Mistral vision images" + ); + } + + const prompt: string | MistralAIChatMessage[] = messages ?? 
req.body.prompt; result = await countTokens({ req, prompt, service }); + + // Add the image tokens to the total count + if (imageTokens > 0) { + result.token_count += imageTokens; + } + break; } case "openai-image": { diff --git a/src/proxy/middleware/request/preprocessors/transform-outbound-payload.ts b/src/proxy/middleware/request/preprocessors/transform-outbound-payload.ts index 5144135..f31df02 100644 --- a/src/proxy/middleware/request/preprocessors/transform-outbound-payload.ts +++ b/src/proxy/middleware/request/preprocessors/transform-outbound-payload.ts @@ -4,7 +4,7 @@ import { API_REQUEST_TRANSFORMERS, } from "../../../../shared/api-schemas"; import { BadRequestError } from "../../../../shared/errors"; -import { fixMistralPrompt } from "../../../../shared/api-schemas/mistral-ai"; +import { fixMistralPrompt, isMistralVisionModel } from "../../../../shared/api-schemas/mistral-ai"; import { isImageGenerationRequest, isTextGenerationRequest, @@ -117,12 +117,66 @@ function applyMistralPromptFixes(req: Request): void { // mistral prompt and try to fix it if it fails. It will be re-validated // after this function returns. 
const result = API_REQUEST_VALIDATORS["mistral-ai"].parse(req.body); + + // Check if this is a vision model request + const isVisionModel = isMistralVisionModel(req.body.model); + + // Check if the request contains image content + const hasImageContent = result.messages?.some((msg: {content: string | any[]}) => + Array.isArray(msg.content) && + msg.content.some((item: any) => item.type === "image_url") + ); + + // For vision requests, normalize the image_url format + if (hasImageContent && Array.isArray(result.messages)) { + // Process each message with image content + result.messages.forEach((msg: any) => { + if (Array.isArray(msg.content)) { + // Process each content item + msg.content.forEach((item: any) => { + if (item.type === "image_url") { + // Normalize the image_url field to a string format that Mistral expects + if (typeof item.image_url === "object") { + // If it's an object, extract the URL or base64 data + if (item.image_url.url) { + item.image_url = item.image_url.url; + } else if (item.image_url.data) { + item.image_url = item.image_url.data; + } + + req.log.info( + { model: req.body.model }, + "Normalized object-format image_url to string format" + ); + } + } + }); + } + }); + } + + // Apply Mistral prompt fixes while preserving multimodal content req.body.messages = fixMistralPrompt(result.messages); req.log.info( - { n: req.body.messages.length, prev: result.messages.length }, + { + n: req.body.messages.length, + prev: result.messages.length, + isVisionModel, + hasImageContent + }, "Applied Mistral chat prompt fixes." ); + // If this is a vision model with image content, it MUST use the chat API + // and cannot be converted to text completions + if (hasImageContent) { + req.log.info( + { model: req.body.model }, + "Detected Mistral vision request with image content. Keeping as chat format." 
+ ); + return; + } + // If the prompt relies on `prefix: true` for the last message, we need to // convert it to a text completions request because AWS Mistral support for // this feature is broken. diff --git a/src/proxy/mistral-ai.ts b/src/proxy/mistral-ai.ts index 5a69cd8..09faa08 100644 --- a/src/proxy/mistral-ai.ts +++ b/src/proxy/mistral-ai.ts @@ -20,38 +20,61 @@ import { createQueuedProxyMiddleware } from "./middleware/request/proxy-middlewa // months of releasing them so this list is hard to keep up to date. 2024-07-28 // https://docs.mistral.ai/platform/endpoints export const KNOWN_MISTRAL_AI_MODELS = [ - /* - Mistral Nemo - "A 12B model built with the partnership with Nvidia. It is easy to use and a - drop-in replacement in any system using Mistral 7B that it supersedes." - */ + /* Premier models */ + // Mistral Large (top-tier reasoning model) + "mistral-large-latest", + "mistral-large-2411", + "mistral-large-2407", + "mistral-large-2402", // older version + + // Pixtral Large (multimodal/vision model) + "pixtral-large-latest", + "pixtral-large-2411", + + // Mistral Saba (language-specialized model) + "mistral-saba-latest", + "mistral-saba-2502", + + // Codestral (code model) + "codestral-latest", + "codestral-2501", + "codestral-2405", + + // Ministral models (edge models) + "ministral-8b-latest", + "ministral-8b-2410", + "ministral-3b-latest", + "ministral-3b-2410", + + // Embedding & Moderation + "mistral-embed", + "mistral-embed-2312", + "mistral-moderation-latest", + "mistral-moderation-2411", + + /* Free models */ + // Mistral Small (with vision in latest version) + "mistral-small-latest", + "mistral-small-2503", // v3.1 with vision + "mistral-small-2402", // older version + + // Pixtral 12B (vision model) + "pixtral-12b-latest", + "pixtral-12b-2409", + + /* Research & Open Models */ + // Mistral Nemo "open-mistral-nemo", "open-mistral-nemo-2407", - /* - Mistral Large - "Our flagship model with state-of-the-art reasoning, knowledge, and coding - 
capabilities." - */ - "mistral-large-latest", - "mistral-large-2407", - "mistral-large-2402", // deprecated - /* - Codestral - "A cutting-edge generative model that has been specifically designed and - optimized for code generation tasks, including fill-in-the-middle and code - completion." - note: this uses a separate bidi completion endpoint that is not implemented - */ - "codestral-latest", - "codestral-2405", - /* So-called "Research Models" */ + + // Earlier Mixtral & Mistral models "open-mistral-7b", "open-mixtral-8x7b", - "open-mistral-8x22b", + "open-mixtral-8x22b", "open-codestral-mamba", - /* Deprecated production models */ - "mistral-small-latest", - "mistral-small-2402", + "mathstral", + + /* Legacy/deprecated models */ "mistral-medium-latest", "mistral-medium-2312", "mistral-tiny", diff --git a/src/shared/api-schemas/mistral-ai.ts b/src/shared/api-schemas/mistral-ai.ts index 2a0530d..b7be246 100644 --- a/src/shared/api-schemas/mistral-ai.ts +++ b/src/shared/api-schemas/mistral-ai.ts @@ -4,9 +4,59 @@ import { Template } from "@huggingface/jinja"; import { APIFormatTransformer } from "./index"; import { logger } from "../../logger"; +// Define the content types for multimodal messages +export const TextContentSchema = z.object({ + type: z.literal("text"), + text: z.string() +}); + +export const ImageUrlContentSchema = z.object({ + type: z.literal("image_url"), + image_url: z.union([ + // URL format (https://...) + z.string().url(), + // Base64 format (data:image/jpeg;base64,...) 
+ z.string().regex(/^data:image\/(jpeg|png|gif|webp);base64,/), + // Object format (might contain detail or url properties) + z.record(z.any()), + // Allow any string for maximum compatibility + z.string() + ]) +}); + +export const ContentItemSchema = z.union([TextContentSchema, ImageUrlContentSchema]); + +// Export types for the content schemas +export type TextContent = z.infer<typeof TextContentSchema>; +export type ImageUrlContent = z.infer<typeof ImageUrlContentSchema>; +export type ContentItem = z.infer<typeof ContentItemSchema>; + +// List of Mistral models with vision capabilities +export const MISTRAL_VISION_MODELS = [ + "pixtral-12b-2409", + "pixtral-12b-latest", + "pixtral-large-2411", + "pixtral-large-latest", + "mistral-small-2503", + "mistral-small-latest" +]; + +// Helper function to check if a model supports vision +export function isMistralVisionModel(model: string): boolean { + return MISTRAL_VISION_MODELS.some(visionModel => + model === visionModel || + model.startsWith(`${visionModel}-`) + ); +} + +// Main Mistral chat message schema const MistralChatMessageSchema = z.object({ role: z.enum(["system", "user", "assistant", "tool"]), // TODO: implement tools - content: z.string(), + // Support both string content (for backwards compatibility) and array of content items (for multimodal) + content: z.union([ + z.string(), + z.array(ContentItemSchema) + ]), prefix: z.boolean().optional(), }); @@ -107,7 +157,26 @@ export function fixMistralPrompt( // Consolidate multiple messages from the same role const last = acc[acc.length - 1]; if (last.role === copy.role) { - last.content += "\n\n" + copy.content; + // Handle different content types for consolidation + if (typeof last.content === "string" && typeof copy.content === "string") { + // Both are strings, concatenate them + last.content += "\n\n" + copy.content; + } else if (Array.isArray(last.content) && typeof copy.content === "string") { + // Add the string content as a new text content item + last.content.push({ + type: "text", + text: copy.content + }); + } else if (typeof 
last.content === "string" && Array.isArray(copy.content)) { + // Convert last.content to array and append copy.content items + last.content = [ + { type: "text", text: last.content }, + ...copy.content + ]; + } else if (Array.isArray(last.content) && Array.isArray(copy.content)) { + // Both are arrays, concatenate them + last.content = [...last.content, ...copy.content]; + } } else { acc.push(copy); } @@ -125,18 +194,41 @@ export function fixMistralPrompt( let jinjaTemplate: Template; let renderTemplate: (messages: MistralAIChatMessage[]) => string; + +// Helper function to convert multimodal content to string format for text-only models +function contentToString(content: string | any[]): string { + if (typeof content === "string") { + return content; + } else if (Array.isArray(content)) { + // For multimodal content, extract only the text parts + // Images are not supported in text-only templates + return content + .filter(item => item.type === "text") + .map(item => (item as any).text) + .join("\n\n"); + } + return ""; +} + function renderMistralPrompt(messages: MistralAIChatMessage[]) { if (!jinjaTemplate) { logger.warn("Lazy loading mistral chat template..."); const { chatTemplate, bosToken, eosToken } = require("./templates/mistral-template").MISTRAL_TEMPLATE; jinjaTemplate = new Template(chatTemplate); - renderTemplate = (messages) => - jinjaTemplate.render({ - messages, + renderTemplate = (messages) => { + // We need to convert any multimodal content to string format for the template + const textOnlyMessages = messages.map(msg => ({ + ...msg, + content: contentToString(msg.content) + })); + + return jinjaTemplate.render({ + messages: textOnlyMessages, bos_token: bosToken, eos_token: eosToken, }); + }; } return renderTemplate(messages); @@ -145,6 +237,9 @@ function renderMistralPrompt(messages: MistralAIChatMessage[]) { /** * Attempts to convert a Mistral chat completions request to a text completions, * using the official prompt template published by 
Mistral. + * + * Note: This transformation is only applicable for text-only models. + * Multimodal/vision models (Pixtral, etc.) cannot use this transformation. */ export const transformMistralChatToText: APIFormatTransformer< typeof MistralAIV1TextCompletionsSchema @@ -159,8 +254,24 @@ export const transformMistralChatToText: APIFormatTransformer< throw result.error; } - const { messages, ...rest } = result.data; - const prompt = renderMistralPrompt(messages); + // Check if this is a vision request (contains any image_url content items) + const { messages, model, ...rest } = result.data; + const hasVisionContent = messages.some(msg => + Array.isArray(msg.content) && + msg.content.some(item => item.type === "image_url") + ); - return { ...rest, prompt, messages: undefined }; + // Cannot transform vision requests to text completions + if (hasVisionContent) { + req.log.warn( + { model }, + "Cannot transform Mistral vision request to text completions format" + ); + throw new Error( + "Vision requests (with image_url content) cannot be transformed to text completions format" + ); + } + + const prompt = renderMistralPrompt(messages); + return { ...rest, model, prompt, messages: undefined }; }; diff --git a/src/shared/models.ts b/src/shared/models.ts index 4c2fed2..323107f 100644 --- a/src/shared/models.ts +++ b/src/shared/models.ts @@ -248,22 +248,56 @@ export function getGoogleAIModelFamily(model: string): GoogleAIModelFamily { } export function getMistralAIModelFamily(model: string): MistralAIModelFamily { - const prunedModel = model.replace(/-(latest|\d{4})$/, ""); + const prunedModel = model.replace(/-(latest|\d{4}(-\d{2}){0,2})$/, ""); + + // Premier models (higher tier) switch (prunedModel) { + // Existing direct matches case "mistral-tiny": case "mistral-small": case "mistral-medium": case "mistral-large": return prunedModel as MistralAIModelFamily; + + // Premier models - Large tier + case "mistral-large": + case "pixtral-large": + return "mistral-large"; + + // 
Premier models - Medium tier + case "mistral-saba": + return "mistral-medium"; + + // Premier models - Small tier + case "codestral": + case "ministral-8b": + case "mistral-embed": + case "mistral-moderation": + return "mistral-small"; + + // Premier models - Tiny tier + case "ministral-3b": + return "mistral-tiny"; + + // Free models - Tiny tier case "open-mistral-7b": return "mistral-tiny"; + + // Free models - Small tier + case "mistral-small": + case "pixtral": + case "pixtral-12b": case "open-mistral-nemo": case "open-mixtral-8x7b": - case "codestral": case "open-codestral-mamba": + case "mathstral": return "mistral-small"; + + // Free models - Medium tier case "open-mixtral-8x22b": return "mistral-medium"; + + // Default to small if unknown default: return "mistral-small"; } diff --git a/src/shared/stats.ts b/src/shared/stats.ts index 95912f5..7d76f67 100644 --- a/src/shared/stats.ts +++ b/src/shared/stats.ts @@ -105,19 +105,27 @@ export function getTokenCostUsd(model: ModelFamily, tokens: number) { break; case "aws-mistral-tiny": case "mistral-tiny": - cost = 0.0000003; + // Using Ministral 3B pricing: $0.04/1M input tokens, $0.04/1M output tokens + // For edge/tiny models, a more balanced 1:1 ratio is used + cost = 0.00000004; break; case "aws-mistral-small": case "mistral-small": - cost = 0.00000035; + // Using Codestral pricing: $0.3/1M input, $0.9/1M output (highest in category) + // Weighted average for 1:3 input:output ratio + cost = 0.00000075; break; case "aws-mistral-medium": case "mistral-medium": - cost = 0.000004; + // Using Mistral Saba pricing: $0.2/1M input, $0.6/1M output + // Weighted average for 1:3 input:output ratio + cost = 0.0000005; break; case "aws-mistral-large": case "mistral-large": - cost = 0.000012; + // Using Mistral Large/Pixtral Large pricing: $2/1M input, $6/1M output + // Weighted average for 1:3 input:output ratio + cost = 0.000005; break; case "gemini-flash": cost = 0.0000002326;