diff --git a/src/info-page.ts b/src/info-page.ts
index dbe5838..cb5a3b7 100644
--- a/src/info-page.ts
+++ b/src/info-page.ts
@@ -30,6 +30,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
   "o3": "OpenAI o3",
   "o4-mini": "OpenAI o4 mini",
   "dall-e": "DALL-E",
+  "gpt-image": "GPT Image",
   claude: "Claude (Sonnet)",
   "claude-opus": "Claude (Opus)",
   "gemini-flash": "Gemini Flash",
@@ -63,6 +64,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
   "azure-o3": "Azure o3",
   "azure-o4-mini": "Azure o4 mini",
   "azure-dall-e": "Azure DALL-E",
+  "azure-gpt-image": "Azure GPT Image",
 };
 
 const converter = new showdown.Converter();
@@ -213,15 +215,15 @@ function getServerTitle() {
 }
 
 function buildRecentImageSection() {
-  const dalleModels: ModelFamily[] = ["azure-dall-e", "dall-e"];
+  const imageModels: ModelFamily[] = ["azure-dall-e", "dall-e", "gpt-image", "azure-gpt-image"];
   if (
     !config.showRecentImages ||
-    dalleModels.every((f) => !config.allowedModelFamilies.includes(f))
+    imageModels.every((f) => !config.allowedModelFamilies.includes(f))
   ) {
     return "";
   }
 
-  let html = `<h2>Recent DALL-E Generations</h2>`;
+  let html = `<h2>Recent Image Generations</h2>`;
   const recentImages = getLastNImages(12).reverse();
   if (recentImages.length === 0) {
     html += `<p>No images yet.</p>`;
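A minimal sketch of the gating logic above, with hypothetical config values, to show why the section renders whenever at least one image-capable family is allowed:

```ts
type ModelFamily = "dall-e" | "azure-dall-e" | "gpt-image" | "azure-gpt-image";

const imageModels: ModelFamily[] = ["azure-dall-e", "dall-e", "gpt-image", "azure-gpt-image"];
// Hypothetical deployment that only enables gpt-image.
const allowedModelFamilies: ModelFamily[] = ["gpt-image"];

// every(... !includes ...) is true only when NO image family is allowed,
// so a single overlapping family keeps the section visible.
const hideSection = imageModels.every((f) => !allowedModelFamilies.includes(f));
console.log(hideSection); // false -> gallery is shown
```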
"gptimage-" + req.id : "dalle-" + req.id, object: "chat.completion", created: Date.now(), model: req.body.model, - usage: { - prompt_tokens: 0, - completion_tokens: req.outputTokens, - total_tokens: req.outputTokens, - }, + usage, choices: [ { message: { role: "assistant", content }, @@ -89,6 +108,56 @@ function transformResponseForChat( }; } +// Filter parameters based on the model being used to avoid sending unsupported parameters +function filterModelParameters(manager: ProxyReqManager) { + const req = manager.request; + const originalBody = req.body; + const modelName = originalBody?.model || ""; + + // Skip if no body or it's not an object + if (!originalBody || typeof originalBody !== 'object') return; + + // Create a deep copy of the body to filter + const filteredBody = { ...originalBody }; + + // Define allowed parameters for each model + if (modelName.includes('dall-e-2')) { + // DALL-E 2 parameters + const allowedParams = [ + 'model', 'prompt', 'n', 'size', 'response_format', 'user' + ]; + + // Remove any parameter not in the allowed list + Object.keys(filteredBody).forEach(key => { + if (!allowedParams.includes(key)) { + delete filteredBody[key]; + } + }); + + req.log.info({ model: 'dall-e-2', params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 2"); + } else if (modelName.includes('dall-e-3')) { + // DALL-E 3 parameters + const allowedParams = [ + 'model', 'prompt', 'n', 'quality', 'size', 'style', 'response_format', 'user' + ]; + + // Remove any parameter not in the allowed list + Object.keys(filteredBody).forEach(key => { + if (!allowedParams.includes(key)) { + delete filteredBody[key]; + } + }); + + req.log.info({ model: 'dall-e-3', params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 3"); + } else if (modelName.includes('gpt-image')) { + // For gpt-image-1, we can use all parameters + req.log.info({ model: 'gpt-image-1', params: Object.keys(filteredBody) }, "Using all parameters for GPT Image"); + } + + // Use the proper method to update the body + manager.setBody(filteredBody); +} + function replacePath(manager: ProxyReqManager) { const req = manager.request; const pathname = req.url.split("?")[0]; @@ -100,7 +169,7 @@ function replacePath(manager: ProxyReqManager) { const openaiImagesProxy = createQueuedProxyMiddleware({ target: "https://api.openai.com", - mutations: [replacePath, addKey, finalizeBody], + mutations: [replacePath, filterModelParameters, addKey, finalizeBody], blockingResponseHandler: openaiImagesResponseHandler, }); @@ -116,6 +185,17 @@ openaiImagesRouter.post( }), openaiImagesProxy ); +// Add support for the /v1/images/edits endpoint (used by gpt-image-1 for image editing) +openaiImagesRouter.post( + "/v1/images/edits", + ipLimiter, + createPreprocessorMiddleware({ + inApi: "openai-image", + outApi: "openai-image", + service: "openai", + }), + openaiImagesProxy +); openaiImagesRouter.post( "/v1/chat/completions", ipLimiter, diff --git a/src/shared/api-schemas/openai-image.ts b/src/shared/api-schemas/openai-image.ts index 7133362..bab379e 100644 --- a/src/shared/api-schemas/openai-image.ts +++ b/src/shared/api-schemas/openai-image.ts @@ -1,20 +1,58 @@ import { z } from "zod"; +import { Request } from "express"; import { OpenAIV1ChatCompletionSchema } from "./openai"; import { APIFormatTransformer } from "./index"; +// Extend the Express Request type to include multimodal content +declare global { + namespace Express { + interface Request { + multimodalContent?: { + prompt?: string; + images?: string[]; + }; + } + } +} + // 
diff --git a/src/shared/api-schemas/openai-image.ts b/src/shared/api-schemas/openai-image.ts
index 7133362..bab379e 100644
--- a/src/shared/api-schemas/openai-image.ts
+++ b/src/shared/api-schemas/openai-image.ts
@@ -1,20 +1,58 @@
 import { z } from "zod";
+import { Request } from "express";
 import { OpenAIV1ChatCompletionSchema } from "./openai";
 import { APIFormatTransformer } from "./index";
 
+// Extend the Express Request type to carry multimodal content extracted from
+// chat messages (a text prompt plus any attached images).
+declare global {
+  namespace Express {
+    interface Request {
+      multimodalContent?: {
+        prompt?: string;
+        images?: string[];
+      };
+    }
+  }
+}
+
 // https://platform.openai.com/docs/api-reference/images/create
 export const OpenAIV1ImagesGenerationSchema = z
   .object({
-    prompt: z.string().max(4000),
+    prompt: z.string().max(32000), // gpt-image-1 accepts prompts up to 32000 characters
     model: z.string().max(100).optional(),
-    quality: z.enum(["standard", "hd"]).optional().default("standard"),
-    n: z.number().int().min(1).max(4).optional().default(1),
-    response_format: z.enum(["url", "b64_json"]).optional(),
+    // Image inputs (multimodal capability of gpt-image-1)
+    image: z.union([
+      z.string(),          // single image (base64 or URL)
+      z.array(z.string())  // array of images
+    ]).optional(),
+    mask: z.string().optional(), // mask image for editing
+    // Quality options differ between models
+    quality: z
+      .union([
+        z.enum(["standard", "hd"]),        // dall-e-3 options
+        z.enum(["high", "medium", "low"]), // gpt-image-1 options
+        z.literal("auto")                  // gpt-image-1 default
+      ])
+      .optional()
+      .default("standard"),
+    n: z.number().int().min(1).max(10).optional().default(1), // gpt-image-1 supports up to 10
+    response_format: z.enum(["url", "b64_json"]).optional(), // note: gpt-image-1 always returns b64_json
+    // Size options also differ between models
     size: z
-      .enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"])
+      .union([
+        // dall-e models
+        z.enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"]),
+        // gpt-image-1 (square, landscape, portrait, or auto)
+        z.enum(["1024x1024", "1536x1024", "1024x1536", "auto"])
+      ])
      .optional()
      .default("1024x1024"),
-    style: z.enum(["vivid", "natural"]).optional().default("vivid"),
+    style: z.enum(["vivid", "natural"]).optional().default("vivid"), // dall-e-3 only
+    // gpt-image-1 specific parameters
+    background: z.enum(["transparent", "opaque", "auto"]).optional(),
+    moderation: z.enum(["low", "auto"]).optional(),
+    output_compression: z.number().int().min(0).max(100).optional(),
+    output_format: z.enum(["png", "jpeg", "webp"]).optional(),
     user: z.string().max(500).optional(),
   })
   .strip();
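A quick sketch of how the widened schema behaves for a gpt-image-1 request, assuming `OpenAIV1ImagesGenerationSchema` from the module above is in scope; the parameter values are illustrative:

```ts
const parsed = OpenAIV1ImagesGenerationSchema.parse({
  model: "gpt-image-1",
  prompt: "A watercolor painting of a lighthouse at dawn",
  quality: "medium",   // gpt-image-1 tier; "standard"/"hd" are dall-e-3's
  size: "1536x1024",   // landscape, new with gpt-image-1
  n: 2,                // up to 10 for gpt-image-1
  output_format: "webp",
  output_compression: 80,
  foo: "bar",          // unknown keys are removed by .strip()
});
// Defaults are filled in when omitted, e.g. quality -> "standard", n -> 1.
console.log(parsed);
```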
@@ -34,9 +72,41 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
   }
 
   const { messages } = result.data;
-  const prompt = messages.filter((m) => m.role === "user").pop()?.content;
-  if (Array.isArray(prompt)) {
-    throw new Error("Image generation prompt must be a text message.");
+  const userMessage = messages.filter((m) => m.role === "user").pop();
+  if (!userMessage) {
+    throw new Error("No user message found in the request.");
+  }
+
+  const content = userMessage.content;
+
+  // Handle array content (multimodal content mixing text and images).
+  if (Array.isArray(content)) {
+    const textParts: string[] = [];
+    const imageParts: string[] = [];
+
+    // Split the content parts into text and images.
+    content.forEach((part) => {
+      if (typeof part === "string") {
+        textParts.push(part);
+      } else if (part.type === "text") {
+        textParts.push(part.text);
+      } else if (part.type === "image_url") {
+        // Extract the image URL or base64 data URL from the part.
+        const imageUrl = typeof part.image_url === "string"
+          ? part.image_url
+          : part.image_url.url;
+        imageParts.push(imageUrl);
+      }
+    });
+
+    // Join all text parts to form the prompt.
+    const prompt = textParts.join("\n");
+
+    // For gpt-image-1, pass along both the text prompt and the image(s).
+    req.multimodalContent = {
+      prompt,
+      images: imageParts,
+    };
+  } else if (typeof content !== "string") {
+    throw new Error("Image generation prompt must be a text message or multimodal content.");
   }
 
   if (body.stream) {
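For reference, a chat request that exercises the multimodal path above; the attachment is a placeholder data URL:

```ts
// A chat message whose content mixes a text part and an image part.
// The transformer would set req.multimodalContent to:
//   { prompt: "Make the sky more dramatic", images: ["data:image/png;base64,..."] }
const chatRequest = {
  model: "gpt-image-1",
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "Make the sky more dramatic" },
        {
          type: "image_url",
          image_url: { url: "data:image/png;base64,iVBORw0KGgo..." }, // placeholder
        },
      ],
    },
  ],
};
```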
@@ -49,20 +119,172 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
   // character name or wrapping the entire thing in quotes. We will look for
   // the index of "Image:" and use everything after that as the prompt.
 
-  const index = prompt?.toLowerCase().indexOf("image:");
-  if (index === -1 || !prompt) {
-    throw new Error(
-      `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${prompt}).`
-    );
+  // Multimodal requests (image editing with gpt-image-1) don't require the
+  // "Image:" prefix.
+  const isMultimodalRequest =
+    Array.isArray(content) &&
+    req.multimodalContent?.images &&
+    req.multimodalContent.images.length > 0;
+
+  // Only enforce the "Image:" prefix for plain text requests.
+  if (!isMultimodalRequest && typeof content === "string") {
+    const textIndex = content.toLowerCase().indexOf("image:");
+    if (textIndex === -1) {
+      throw new Error(
+        `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${content}).`
+      );
+    }
   }
 
   // TODO: Add some way to specify parameters via chat message
-  const transformed = {
-    model: body.model.includes("dall-e") ? body.model : "dall-e-3",
-    quality: "standard",
-    size: "1024x1024",
-    response_format: "url",
-    prompt: prompt.slice(index! + 6).trim(),
-  };
+  const isGptImage = body.model?.includes("gpt-image") || false;
+
+  // Extract the text prompt from multimodal content or plain string content.
+  let textPrompt: string | undefined;
+  let index = -1;
+
+  if (Array.isArray(content)) {
+    textPrompt = req.multimodalContent?.prompt;
+  } else if (typeof content === "string") {
+    index = content.toLowerCase().indexOf("image:");
+    textPrompt = index !== -1 ? content.slice(index + 6).trim() : content;
+  }
+
+  if (!textPrompt) {
+    throw new Error("No text prompt found in the request.");
+  }
+
+  // Determine the exact model to use, defaulting to dall-e-3 when nothing
+  // specific was requested.
+  let modelName: string;
+  if (isGptImage) {
+    modelName = "gpt-image-1";
+  } else if (body.model?.includes("dall-e-2")) {
+    modelName = "dall-e-2";
+  } else {
+    modelName = "dall-e-3";
+  }
+
+  // Start with the parameters common to all models.
+  const transformed: any = {
+    model: modelName,
+    prompt: textPrompt,
+  };
+
+  // Add model-specific parameters.
+  if (modelName === "gpt-image-1") {
+    transformed.quality = "auto";   // default quality for gpt-image-1
+    transformed.size = "1024x1024"; // default size (square)
+    transformed.moderation = "low"; // always request low moderation for gpt-image-1
+
+    // Optional gpt-image-1-only parameters.
+    if (body.background) transformed.background = body.background;
+    if (body.output_format) transformed.output_format = body.output_format;
+    // Check against undefined so a compression level of 0 is preserved.
+    if (body.output_compression !== undefined) transformed.output_compression = body.output_compression;
+
+    if (body.quality && ["high", "medium", "low", "auto"].includes(body.quality)) {
+      transformed.quality = body.quality;
+    }
+    if (body.size && ["1024x1024", "1536x1024", "1024x1536", "auto"].includes(body.size)) {
+      transformed.size = body.size;
+    }
+    // No response_format: gpt-image-1 always returns b64_json.
+  } else if (modelName === "dall-e-3") {
+    transformed.size = "1024x1024";      // default size
+    transformed.response_format = "url"; // default format
+    transformed.quality = "standard";    // default quality
+
+    transformed.style =
+      body.style && ["vivid", "natural"].includes(body.style) ? body.style : "vivid";
+
+    if (body.quality && ["standard", "hd"].includes(body.quality)) {
+      transformed.quality = body.quality;
+    }
+    if (body.size && ["1024x1024", "1792x1024", "1024x1792"].includes(body.size)) {
+      transformed.size = body.size;
+    }
+  } else {
+    // dall-e-2 takes no quality or style parameters, so none are set here.
+    transformed.size = "1024x1024";      // default size
+    transformed.response_format = "url"; // default format
+
+    if (body.size && ["256x256", "512x512", "1024x1024"].includes(body.size)) {
+      transformed.size = body.size;
+    }
+  }
+
+  // Common parameters.
+  if (body.n && !isNaN(Number(body.n))) {
+    // dall-e-3 only supports n=1.
+    transformed.n = modelName === "dall-e-3" ? 1 : Number(body.n);
+  }
+
+  if (!isGptImage && body.response_format && ["url", "b64_json"].includes(body.response_format)) {
+    transformed.response_format = body.response_format;
+  }
+
+  // If this is gpt-image-1 and image content is attached, include it in the
+  // transformed request.
+  if (isGptImage && req.multimodalContent?.images && req.multimodalContent.images.length > 0) {
+    // The edits endpoint accepts a single image or an array of images.
+    transformed.image = req.multimodalContent.images.length === 1
+      ? req.multimodalContent.images[0]
+      : req.multimodalContent.images;
+
+    // Any gpt-image-1 request with image inputs must use the images/edits endpoint.
+    req.log.info(`${req.multimodalContent.images.length} image(s) detected for gpt-image-1, using images/edits endpoint`);
+    if (req.path.startsWith("/v1/chat/completions")) {
+      req.url = req.url.replace("/v1/chat/completions", "/v1/images/edits");
+    }
+  }
+
+  // dall-e-2 must not receive parameters introduced by the schema's defaults,
+  // so bypass Zod validation and send only the parameters it supports.
+  if (modelName === "dall-e-2") {
+    const filteredTransformed: any = {};
+    const supportedParams = [
+      "model", "prompt", "n", "size", "response_format", "user",
+    ];
+    for (const param of supportedParams) {
+      if (transformed[param] !== undefined) {
+        filteredTransformed[param] = transformed[param];
+      }
+    }
+    req.log.info({ params: Object.keys(filteredTransformed) }, "Filtered parameters for dall-e-2");
+    return filteredTransformed;
+  }
+
+  // For other models, validate through the schema as normal.
   return OpenAIV1ImagesGenerationSchema.parse(transformed);
 };
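Putting the transform together, an illustrative sketch of the two main request mappings; values follow the defaults above, and the prompts and attachment are placeholders:

```ts
// Text-only request: the "Image:" prefix is required and stripped; dall-e-3
// defaults apply when no image model is named.
const textOnlyMapping = {
  input: { role: "user", content: "Image: a red fox in the snow" },
  output: {
    model: "dall-e-3",
    prompt: "a red fox in the snow",
    quality: "standard",
    size: "1024x1024",
    style: "vivid",
    response_format: "url",
  },
};

// Multimodal request: no prefix required; the attachment is forwarded and the
// request is rerouted to /v1/images/edits.
const multimodalMapping = {
  output: {
    model: "gpt-image-1",
    prompt: "Make the sky more dramatic",
    quality: "auto",
    size: "1024x1024",
    moderation: "low",
    image: "data:image/png;base64,iVBORw0KGgo...", // placeholder
  },
};
```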
diff --git a/src/shared/file-storage/mirror-generated-image.ts b/src/shared/file-storage/mirror-generated-image.ts
index 978a858..4ee9494 100644
--- a/src/shared/file-storage/mirror-generated-image.ts
+++ b/src/shared/file-storage/mirror-generated-image.ts
@@ -13,9 +13,19 @@ export type OpenAIImageGenerationResult = {
   created: number;
   data: {
     revised_prompt?: string;
-    url: string;
-    b64_json: string;
+    url?: string; // gpt-image-1 doesn't return URLs, only b64_json
+    b64_json?: string;
   }[];
+  // Added for gpt-image-1 responses.
+  usage?: {
+    total_tokens: number;
+    input_tokens: number;
+    output_tokens: number;
+    input_tokens_details?: {
+      text_tokens: number;
+      image_tokens: number;
+    };
+  };
 };
 
 async function downloadImage(url: string) {
@@ -65,11 +75,16 @@ export async function mirrorGeneratedImage(
     let mirror: string;
     if (item.b64_json) {
       mirror = await saveB64Image(item.b64_json);
-    } else {
+    } else if (item.url) {
       mirror = await downloadImage(item.url);
+    } else {
+      req.log.warn("No image data found in response");
+      continue;
     }
+
+    // Point the item's URL at our mirrored copy.
     item.url = `${host}/user_content/${path.basename(mirror)}`;
     await createThumbnail(mirror);
+
+    // Record the image in history using the local URL.
     addToImageHistory({
       url: item.url,
       prompt,
diff --git a/src/shared/key-management/azure/provider.ts b/src/shared/key-management/azure/provider.ts
index 535acac..7a5a30d 100644
--- a/src/shared/key-management/azure/provider.ts
+++ b/src/shared/key-management/azure/provider.ts
@@ -84,6 +84,7 @@ export class AzureOpenAIKeyProvider implements KeyProvider {
       "azure-o3Tokens": 0,
       "azure-o4-miniTokens": 0,
       "azure-dall-eTokens": 0,
+      "azure-gpt-imageTokens": 0,
       modelIds: [],
     };
     this.keys.push(newKey);
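A minimal sketch of the branching now required in mirrorGeneratedImage, with the module's helpers stubbed out via `declare`:

```ts
// The module's helpers, stubbed for the sketch.
declare function saveB64Image(b64: string): Promise<string>;
declare function downloadImage(url: string): Promise<string>;

// Minimal sketch of the branch added above:
async function resolveMirror(item: { url?: string; b64_json?: string }): Promise<string | null> {
  if (item.b64_json) return saveB64Image(item.b64_json); // gpt-image-1 path
  if (item.url) return downloadImage(item.url);          // dall-e URL path
  return null; // neither present: the caller logs a warning and skips the item
}
```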
diff --git a/src/shared/key-management/openai/provider.ts b/src/shared/key-management/openai/provider.ts
index 92fa8b7..24159e7 100644
--- a/src/shared/key-management/openai/provider.ts
+++ b/src/shared/key-management/openai/provider.ts
@@ -124,6 +124,7 @@ export class OpenAIKeyProvider implements KeyProvider {
       "o3Tokens": 0,
       "o4-miniTokens": 0,
       "dall-eTokens": 0,
+      "gpt-imageTokens": 0,
       modelIds: [],
     };
     this.keys.push(newKey);
diff --git a/src/shared/models.ts b/src/shared/models.ts
index d67085b..4c2fed2 100644
--- a/src/shared/models.ts
+++ b/src/shared/models.ts
@@ -34,7 +34,8 @@ export type OpenAIModelFamily =
   | "o3-mini"
   | "o3"
   | "o4-mini"
-  | "dall-e";
+  | "dall-e"
+  | "gpt-image";
 export type AnthropicModelFamily = "claude" | "claude-opus";
 export type GoogleAIModelFamily =
   | "gemini-flash"
@@ -84,6 +85,7 @@ export const MODEL_FAMILIES = ((
   "o3",
   "o4-mini",
   "dall-e",
+  "gpt-image",
   "claude",
   "claude-opus",
   "gemini-flash",
@@ -117,6 +119,7 @@ export const MODEL_FAMILIES = ((
   "azure-o3-mini",
   "azure-o3",
   "azure-o4-mini",
+  "azure-gpt-image",
 ] as const);
 
 export const LLM_SERVICES = ((
@@ -154,6 +157,7 @@ export const MODEL_FAMILY_SERVICE: {
   "o3": "openai",
   "o4-mini": "openai",
   "dall-e": "openai",
+  "gpt-image": "openai",
   claude: "anthropic",
   "claude-opus": "anthropic",
   "aws-claude": "aws",
@@ -180,6 +184,7 @@ export const MODEL_FAMILY_SERVICE: {
   "azure-o3-mini": "azure",
   "azure-o3": "azure",
   "azure-o4-mini": "azure",
+  "azure-gpt-image": "azure",
   "gemini-flash": "google-ai",
   "gemini-pro": "google-ai",
   "gemini-ultra": "google-ai",
@@ -189,9 +194,10 @@ export const MODEL_FAMILY_SERVICE: {
   "mistral-large": "mistral-ai",
 };
 
-export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e"];
+export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e", "gpt-image", "azure-gpt-image"];
 
 export const OPENAI_MODEL_FAMILY_MAP: { [regex: string]: OpenAIModelFamily } = {
+  "^gpt-image(-\\d+)?(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt-image",
   "^gpt-4\\.5(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt45",
   "^gpt-4\\.1(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41",
   "^gpt-4\\.1-mini(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41-mini",
diff --git a/src/shared/stats.ts b/src/shared/stats.ts
index 16c25d1..95912f5 100644
--- a/src/shared/stats.ts
+++ b/src/shared/stats.ts
@@ -83,6 +83,16 @@ export function getTokenCostUsd(model: ModelFamily, tokens: number) {
     case "dall-e":
       cost = 0.00001;
       break;
+    case "azure-gpt-image":
+    case "gpt-image":
+      // gpt-image-1 pricing:
+      //   text input tokens:   $5 per 1M tokens
+      //   image input tokens:  $10 per 1M tokens
+      //   image output tokens: $40 per 1M tokens
+      // 0.000018 is a blended per-token estimate assuming a typical mix of
+      // input and output; real cost is roughly $0.02-$0.19 per image
+      // depending on quality.
+      cost = 0.000018;
+      break;
     case "aws-claude":
     case "gcp-claude":
     case "claude":
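As a sanity check on the blended rate, a quick sketch of what the $0.000018-per-token estimate yields; the token counts are illustrative, not measured:

```ts
// Blended estimate from the case above: $0.000018 per token.
const perToken = 0.000018;

// Hypothetical gpt-image-1 generation: ~50 input tokens, ~1100 output tokens.
const totalTokens = 50 + 1100;
console.log((totalTokens * perToken).toFixed(4)); // "0.0207" -> about $0.02 per image

// At the raw output rate of $40/1M tokens, the 1100 output tokens alone would be:
console.log((1100 * 0.00004).toFixed(4)); // "0.0440"
```

The gap between the two figures shows why the blended rate undercounts output-heavy generations, which is what the comment's $0.02-$0.19 range is hedging against.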