prelim gpt-image support (can't test, no access)

reanon
2025-04-25 10:38:23 +02:00
parent 465b13e5fb
commit a16d66a45b
8 changed files with 374 additions and 37 deletions
+5 -3
@@ -30,6 +30,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
"o3": "OpenAI o3",
"o4-mini": "OpenAI o4 mini",
"dall-e": "DALL-E",
"gpt-image": "GPT Image",
claude: "Claude (Sonnet)",
"claude-opus": "Claude (Opus)",
"gemini-flash": "Gemini Flash",
@@ -63,6 +64,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
"azure-o3": "Azure o3",
"azure-o4-mini": "Azure o4 mini",
"azure-dall-e": "Azure DALL-E",
"azure-gpt-image": "Azure GPT Image",
};
const converter = new showdown.Converter();
@@ -213,15 +215,15 @@ function getServerTitle() {
}
function buildRecentImageSection() {
- const dalleModels: ModelFamily[] = ["azure-dall-e", "dall-e"];
+ const imageModels: ModelFamily[] = ["azure-dall-e", "dall-e", "gpt-image", "azure-gpt-image"];
if (
!config.showRecentImages ||
- dalleModels.every((f) => !config.allowedModelFamilies.includes(f))
+ imageModels.every((f) => !config.allowedModelFamilies.includes(f))
) {
return "";
}
- let html = `<h2>Recent DALL-E Generations</h2>`;
+ let html = `<h2>Recent Image Generations</h2>`;
const recentImages = getLastNImages(12).reverse();
if (recentImages.length === 0) {
html += `<p>No images yet.</p>`;
+89 -9
@@ -11,7 +11,7 @@ import { ProxyResHandlerWithBody } from "./middleware/response";
import { ProxyReqManager } from "./middleware/request/proxy-req-manager";
import { createQueuedProxyMiddleware } from "./middleware/request/proxy-middleware-factory";
- const KNOWN_MODELS = ["dall-e-2", "dall-e-3"];
+ const KNOWN_MODELS = ["dall-e-2", "dall-e-3", "gpt-image-1"];
let modelListCache: any = null;
let modelListValid = 0;
@@ -58,27 +58,46 @@ function transformResponseForChat(
req: Request
): Record<string, any> {
const prompt = imageBody.data[0].revised_prompt ?? req.body.prompt;
+ const isGptImage = req.body.model?.includes("gpt-image") || false;
const content = imageBody.data
.map((item) => {
const { url, b64_json } = item;
+ // The gpt-image-1 model always returns b64_json
+ // Format will depend on output_format parameter (defaults to png)
+ // For simplicity, we'll assume png if not specified
+ const format = req.body.output_format || "png";
if (b64_json) {
- return `![${prompt}](data:image/png;base64,${b64_json})`;
+ return `![${prompt}](data:image/${format};base64,${b64_json})`;
} else {
return `![${prompt}](${url})`;
}
})
.join("\n\n");
+ // Prepare the usage information - gpt-image-1 includes detailed token usage
+ let usage = {
+ prompt_tokens: 0,
+ completion_tokens: req.outputTokens,
+ total_tokens: req.outputTokens,
+ };
+ // If this is a gpt-image-1 response, it includes detailed usage info
+ if (imageBody.usage) {
+ usage = {
+ prompt_tokens: imageBody.usage.input_tokens || 0,
+ completion_tokens: imageBody.usage.output_tokens || 0,
+ total_tokens: imageBody.usage.total_tokens || 0,
+ };
+ }
return {
id: "dalle-" + req.id,
id: req.body.model?.includes("gpt-image") ? "gptimage-" + req.id : "dalle-" + req.id,
object: "chat.completion",
created: Date.now(),
model: req.body.model,
- usage: {
- prompt_tokens: 0,
- completion_tokens: req.outputTokens,
- total_tokens: req.outputTokens,
- },
+ usage,
choices: [
{
message: { role: "assistant", content },
@@ -89,6 +108,56 @@ function transformResponseForChat(
};
}
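// Illustrative sketch (not part of the commit; ids and token counts are
// hypothetical): for a gpt-image-1 request, the transformed chat completion
// now carries real token counts in `usage` instead of zeros:
//   {
//     id: "gptimage-abc123",
//     object: "chat.completion",
//     model: "gpt-image-1",
//     usage: { prompt_tokens: 123, completion_tokens: 1056, total_tokens: 1179 },
//     choices: [{ message: { role: "assistant", content: "![prompt](data:image/png;base64,...)" } }]
//   }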
+ // Filter parameters based on the model being used to avoid sending unsupported parameters
+ function filterModelParameters(manager: ProxyReqManager) {
+ const req = manager.request;
+ const originalBody = req.body;
+ const modelName = originalBody?.model || "";
+ // Skip if there is no body or it's not an object
+ if (!originalBody || typeof originalBody !== "object") return;
+ // Create a shallow copy of the body to filter (only top-level keys are removed)
+ const filteredBody = { ...originalBody };
+ // Define allowed parameters for each model
+ if (modelName.includes("dall-e-2")) {
+ // DALL-E 2 parameters
+ const allowedParams = [
+ "model", "prompt", "n", "size", "response_format", "user"
+ ];
+ // Remove any parameter not in the allowed list
+ Object.keys(filteredBody).forEach((key) => {
+ if (!allowedParams.includes(key)) {
+ delete filteredBody[key];
+ }
+ });
+ req.log.info({ model: "dall-e-2", params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 2");
+ } else if (modelName.includes("dall-e-3")) {
+ // DALL-E 3 parameters
+ const allowedParams = [
+ "model", "prompt", "n", "quality", "size", "style", "response_format", "user"
+ ];
+ // Remove any parameter not in the allowed list
+ Object.keys(filteredBody).forEach((key) => {
+ if (!allowedParams.includes(key)) {
+ delete filteredBody[key];
+ }
+ });
+ req.log.info({ model: "dall-e-3", params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 3");
+ } else if (modelName.includes("gpt-image")) {
+ // gpt-image-1 accepts all of the new parameters, so nothing is stripped
+ req.log.info({ model: "gpt-image-1", params: Object.keys(filteredBody) }, "Using all parameters for GPT Image");
+ }
+ // Use the ProxyReqManager method to replace the outgoing body
+ manager.setBody(filteredBody);
+ }
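// Illustrative example of the filtering above (assumed request body, not from
// the commit): gpt-image-1-only keys sent to dall-e-2 are stripped before the
// request is proxied upstream.
//   input:  { model: "dall-e-2", prompt: "a red fox", background: "transparent", quality: "high" }
//   output: { model: "dall-e-2", prompt: "a red fox" }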
function replacePath(manager: ProxyReqManager) {
const req = manager.request;
const pathname = req.url.split("?")[0];
@@ -100,7 +169,7 @@ function replacePath(manager: ProxyReqManager) {
const openaiImagesProxy = createQueuedProxyMiddleware({
target: "https://api.openai.com",
- mutations: [replacePath, addKey, finalizeBody],
+ mutations: [replacePath, filterModelParameters, addKey, finalizeBody],
blockingResponseHandler: openaiImagesResponseHandler,
});
@@ -116,6 +185,17 @@ openaiImagesRouter.post(
}),
openaiImagesProxy
);
+ // Add support for the /v1/images/edits endpoint (used by gpt-image-1 for image editing)
+ openaiImagesRouter.post(
+ "/v1/images/edits",
+ ipLimiter,
+ createPreprocessorMiddleware({
+ inApi: "openai-image",
+ outApi: "openai-image",
+ service: "openai",
+ }),
+ openaiImagesProxy
+ );
openaiImagesRouter.post(
"/v1/chat/completions",
ipLimiter,
+242 -20
@@ -1,20 +1,58 @@
import { z } from "zod";
import { Request } from "express";
import { OpenAIV1ChatCompletionSchema } from "./openai";
import { APIFormatTransformer } from "./index";
+ // Extend the Express Request type to include multimodal content
+ declare global {
+ namespace Express {
+ interface Request {
+ multimodalContent?: {
+ prompt?: string;
+ images?: string[];
+ };
+ }
+ }
+ }
// https://platform.openai.com/docs/api-reference/images/create
export const OpenAIV1ImagesGenerationSchema = z
.object({
- prompt: z.string().max(4000),
+ prompt: z.string().max(32000), // gpt-image-1 accepts up to 32,000 chars; DALL-E's lower limits are enforced upstream
model: z.string().max(100).optional(),
quality: z.enum(["standard", "hd"]).optional().default("standard"),
n: z.number().int().min(1).max(4).optional().default(1),
response_format: z.enum(["url", "b64_json"]).optional(),
// Support for image inputs (multimodal capability of gpt-image-1)
image: z.union([
z.string(), // single image (base64 or URL)
z.array(z.string()) // array of images
]).optional(),
mask: z.string().optional(), // mask image for editing
// Different quality options based on model
quality: z
.union([
z.enum(["standard", "hd"]), // dall-e-3 options
z.enum(["high", "medium", "low"]), // gpt-image-1 options
z.literal("auto") // default for gpt-image-1
])
.optional()
.default("standard"),
n: z.number().int().min(1).max(10).optional().default(1), // gpt-image-1 supports up to 10
response_format: z.enum(["url", "b64_json"]).optional(), // Note: gpt-image-1 always returns b64_json
// Enhanced size options for gpt-image-1
size: z
.enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"])
.union([
// dalle models
z.enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"]),
// gpt-image-1 models (adds landscape, portrait, auto)
z.enum(["1024x1024", "1536x1024", "1024x1536", "auto"])
])
.optional()
.default("1024x1024"),
style: z.enum(["vivid", "natural"]).optional().default("vivid"),
style: z.enum(["vivid", "natural"]).optional().default("vivid"), // dall-e-3 only
// New gpt-image-1 specific parameters
background: z.enum(["transparent", "opaque", "auto"]).optional(), // gpt-image-1 only
moderation: z.enum(["low", "auto"]).optional(), // gpt-image-1 only
output_compression: z.number().int().min(0).max(100).optional(), // gpt-image-1 only
output_format: z.enum(["png", "jpeg", "webp"]).optional(), // gpt-image-1 only
user: z.string().max(500).optional(),
})
.strip();
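// Usage sketch (illustrative, not part of the commit): the widened schema now
// accepts gpt-image-1 parameters that the old dall-e-only enums rejected.
// const body = OpenAIV1ImagesGenerationSchema.parse({
//   model: "gpt-image-1",
//   prompt: "Image: a watercolor fox in the snow",
//   quality: "medium",     // gpt-image-1 tier
//   size: "1536x1024",     // landscape, new with gpt-image-1
//   output_format: "webp", // gpt-image-1 only
//   n: 2,
// });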
@@ -34,9 +72,41 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
}
const { messages } = result.data;
- const prompt = messages.filter((m) => m.role === "user").pop()?.content;
- if (Array.isArray(prompt)) {
- throw new Error("Image generation prompt must be a text message.");
+ const userMessage = messages.filter((m) => m.role === "user").pop();
+ if (!userMessage) {
+ throw new Error("No user message found in the request.");
}
+ const content = userMessage.content;
+ // Handle array content (multimodal content with text and images)
+ if (Array.isArray(content)) {
+ const textParts: string[] = [];
+ const imageParts: string[] = [];
+ // Process content parts, extracting text and images
+ content.forEach((part) => {
+ if (typeof part === "string") {
+ textParts.push(part);
+ } else if (part.type === "text") {
+ // Text parts arrive as { type: "text", text: "..." } objects
+ textParts.push(part.text);
+ } else if (part.type === "image_url") {
+ // Extract image URL or base64 data from the content
+ const imageUrl = typeof part.image_url === "string"
+ ? part.image_url
+ : part.image_url.url;
+ imageParts.push(imageUrl);
+ }
+ });
+ // Join all text parts to form the prompt
+ const prompt = textParts.join("\n");
+ // For gpt-image-1, we'll pass both the text prompt and image(s)
+ req.multimodalContent = {
+ prompt,
+ images: imageParts
+ };
+ } else if (typeof content !== "string") {
+ throw new Error("Image generation prompt must be a text message or multimodal content.");
+ }
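// Illustrative example (shapes follow the OpenAI chat format; values are
// hypothetical): a user message like
//   { role: "user", content: [
//       { type: "text", text: "Image: add a red scarf" },
//       { type: "image_url", image_url: { url: "data:image/png;base64,..." } } ] }
// yields req.multimodalContent =
//   { prompt: "Image: add a red scarf", images: ["data:image/png;base64,..."] }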
if (body.stream) {
@@ -49,20 +119,172 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
// character name or wrapping the entire thing in quotes. We will look for
// the index of "Image:" and use everything after that as the prompt.
- const index = prompt?.toLowerCase().indexOf("image:");
- if (index === -1 || !prompt) {
- throw new Error(
- `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${prompt}).`
- );
- }
+ // For multimodal requests (image editing with gpt-image-1), we don't require the "Image:" prefix
+ const isMultimodalRequest = Array.isArray(content) && req.multimodalContent?.images && req.multimodalContent.images.length > 0;
+ // Only enforce the "Image:" prefix for non-multimodal requests
+ if (!isMultimodalRequest && typeof content === "string") {
+ const textIndex = content.toLowerCase().indexOf("image:");
+ if (textIndex === -1) {
+ throw new Error(
+ `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${content}).`
+ );
+ }
+ }
// TODO: Add some way to specify parameters via chat message
- const transformed = {
- model: body.model.includes("dall-e") ? body.model : "dall-e-3",
- quality: "standard",
- size: "1024x1024",
- response_format: "url",
- prompt: prompt.slice(index! + 6).trim(),
- };
+ // Determine which model to use (gpt-image-1 or dall-e-3)
+ const isGptImage = body.model?.includes("gpt-image") || false;
+ // Get the correct text prompt either from multimodal content or plain string content
+ let textPrompt: string | undefined;
+ let index = -1;
+ if (Array.isArray(content)) {
+ textPrompt = req.multimodalContent?.prompt;
+ } else if (typeof content === "string") {
+ index = content.toLowerCase().indexOf("image:");
+ textPrompt = index !== -1 ? content.slice(index + 6).trim() : content;
+ }
+ // Validate that we have a text prompt
+ if (!textPrompt) {
+ throw new Error("No text prompt found in the request.");
+ }
+ // Determine the exact model being used
+ let modelName = "dall-e-2"; // Default
+ if (isGptImage) {
+ modelName = "gpt-image-1";
+ } else if (body.model?.includes("dall-e-3")) {
+ modelName = "dall-e-3";
+ } else if (body.model?.includes("dall-e-2")) {
+ modelName = "dall-e-2";
+ } else {
+ // If no specific model requested, default to dall-e-3
+ modelName = "dall-e-3";
+ }
+ // Start with basic parameters common to all models
+ const transformed: any = {
+ model: modelName,
+ prompt: textPrompt,
+ };
+ // Add model-specific parameters
+ if (modelName === "gpt-image-1") {
+ // GPT Image specific parameters
+ transformed.quality = "auto"; // Default quality for gpt-image-1
+ transformed.size = "1024x1024"; // Default size (square)
+ transformed.moderation = "low"; // Always set moderation to low for gpt-image-1
+ // Optional GPT Image parameters
+ if (body.background) transformed.background = body.background;
+ if (body.output_format) transformed.output_format = body.output_format;
+ // Compare against undefined so a compression level of 0 is not dropped
+ if (body.output_compression !== undefined) transformed.output_compression = body.output_compression;
+ // Handle specific quality settings for gpt-image-1
+ if (body.quality && ["high", "medium", "low", "auto"].includes(body.quality)) {
+ transformed.quality = body.quality;
+ }
+ // Handle specific size settings for gpt-image-1
+ if (body.size && ["1024x1024", "1536x1024", "1024x1536", "auto"].includes(body.size)) {
+ transformed.size = body.size;
+ }
+ // No response_format for gpt-image-1 as it always returns b64_json
} else if (modelName === "dall-e-3") {
// DALL-E 3 specific parameters
transformed.size = "1024x1024"; // Default size
transformed.response_format = "url"; // Default format
transformed.quality = "standard"; // Default quality
// Handle DALL-E 3 style parameter
if (body.style && ["vivid", "natural"].includes(body.style)) {
transformed.style = body.style;
} else {
transformed.style = "vivid"; // Default style
}
// Handle specific quality settings for dall-e-3
if (body.quality && ["standard", "hd"].includes(body.quality)) {
transformed.quality = body.quality;
}
// Handle specific size settings for dall-e-3
if (body.size && ["1024x1024", "1792x1024", "1024x1792"].includes(body.size)) {
transformed.size = body.size;
}
} else {
// DALL-E 2 specific parameters
transformed.size = "1024x1024"; // Default size
transformed.response_format = "url"; // Default format
// NO quality parameter for dall-e-2
// Explicitly remove the quality parameter before sending
delete transformed.quality;
// Handle specific size settings for dall-e-2
if (body.size && ["256x256", "512x512", "1024x1024"].includes(body.size)) {
transformed.size = body.size;
}
}
+ // Handle common parameters (n is already an integer per the chat schema,
+ // so no string parsing is needed)
+ if (typeof body.n === "number") {
+ // For dall-e-3, only n=1 is supported
+ if (modelName === "dall-e-3" && body.n > 1) {
+ transformed.n = 1;
+ } else {
+ transformed.n = body.n;
+ }
+ }
+ // Handle response_format for non-gpt-image models
+ if (!isGptImage && body.response_format && ["url", "b64_json"].includes(body.response_format)) {
+ transformed.response_format = body.response_format;
+ }
+ // If this is gpt-image-1 and we have image content, add it to the transformed request
+ if (isGptImage && req.multimodalContent?.images && req.multimodalContent.images.length > 0) {
+ // For the edit endpoint, we need to format the images properly
+ transformed.image = req.multimodalContent.images.length === 1
+ ? req.multimodalContent.images[0]
+ : req.multimodalContent.images;
+ // Any request with images for gpt-image-1 should use the edits endpoint
+ req.log.info(`${req.multimodalContent.images.length} image(s) detected for gpt-image-1, using images/edits endpoint`);
+ if (req.path.startsWith("/v1/chat/completions")) {
+ req.url = req.url.replace("/v1/chat/completions", "/v1/images/edits");
+ }
+ }
+ // For dall-e-2, we need to make sure we don't introduce unsupported parameters
+ // due to default values in the schema. Let's bypass Zod schema validation here
+ // for dall-e-2 and only include the supported parameters.
+ if (modelName === "dall-e-2") {
+ // Only include parameters that dall-e-2 supports
+ const filteredTransformed: any = {};
+ // List of parameters supported by dall-e-2
+ const supportedParams = [
+ "model", "prompt", "n", "size", "response_format", "user"
+ ];
+ // Copy only supported parameters
+ for (const param of supportedParams) {
+ if (transformed[param] !== undefined) {
+ filteredTransformed[param] = transformed[param];
+ }
+ }
+ // Log what we're sending
+ req.log.info({ params: Object.keys(filteredTransformed) }, "Filtered parameters for dall-e-2");
+ return filteredTransformed;
+ }
+ // For other models, use the schema as normal
return OpenAIV1ImagesGenerationSchema.parse(transformed);
};
@@ -13,9 +13,19 @@ export type OpenAIImageGenerationResult = {
created: number;
data: {
revised_prompt?: string;
- url: string;
- b64_json: string;
+ url?: string; // gpt-image-1 doesn't return URLs, only b64_json
+ b64_json?: string;
}[];
+ // Added for gpt-image-1 responses
+ usage?: {
+ total_tokens: number;
+ input_tokens: number;
+ output_tokens: number;
+ input_tokens_details?: {
+ text_tokens: number;
+ image_tokens: number;
+ };
+ };
};
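// Sketch of a gpt-image-1 response matching this type (values illustrative):
//   {
//     created: 1713833628,
//     data: [{ b64_json: "..." }],
//     usage: {
//       total_tokens: 100, input_tokens: 50, output_tokens: 50,
//       input_tokens_details: { text_tokens: 10, image_tokens: 40 }
//     }
//   }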
async function downloadImage(url: string) {
@@ -65,11 +75,16 @@ export async function mirrorGeneratedImage(
let mirror: string;
if (item.b64_json) {
mirror = await saveB64Image(item.b64_json);
- } else {
+ } else if (item.url) {
mirror = await downloadImage(item.url);
+ } else {
+ req.log.warn("No image data found in response");
+ continue;
+ }
// Set the URL to our mirrored version
item.url = `${host}/user_content/${path.basename(mirror)}`;
await createThumbnail(mirror);
// Add to image history with the local URL
addToImageHistory({
url: item.url,
prompt,
@@ -84,6 +84,7 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
"azure-o3Tokens": 0,
"azure-o4-miniTokens": 0,
"azure-dall-eTokens": 0,
"azure-gpt-imageTokens": 0,
modelIds: [],
};
this.keys.push(newKey);
@@ -124,6 +124,7 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
"o3Tokens": 0,
"o4-miniTokens": 0,
"dall-eTokens": 0,
"gpt-imageTokens": 0,
modelIds: [],
};
this.keys.push(newKey);
+8 -2
@@ -34,7 +34,8 @@ export type OpenAIModelFamily =
| "o3-mini"
| "o3"
| "o4-mini"
| "dall-e";
| "dall-e"
| "gpt-image";
export type AnthropicModelFamily = "claude" | "claude-opus";
export type GoogleAIModelFamily =
| "gemini-flash"
@@ -84,6 +85,7 @@ export const MODEL_FAMILIES = (<A extends readonly ModelFamily[]>(
"o3",
"o4-mini",
"dall-e",
"gpt-image",
"claude",
"claude-opus",
"gemini-flash",
@@ -117,6 +119,7 @@ export const MODEL_FAMILIES = (<A extends readonly ModelFamily[]>(
"azure-o3-mini",
"azure-o3",
"azure-o4-mini",
"azure-gpt-image",
] as const);
export const LLM_SERVICES = (<A extends readonly LLMService[]>(
@@ -154,6 +157,7 @@ export const MODEL_FAMILY_SERVICE: {
"o3": "openai",
"o4-mini": "openai",
"dall-e": "openai",
"gpt-image": "openai",
claude: "anthropic",
"claude-opus": "anthropic",
"aws-claude": "aws",
@@ -180,6 +184,7 @@ export const MODEL_FAMILY_SERVICE: {
"azure-o3-mini": "azure",
"azure-o3": "azure",
"azure-o4-mini": "azure",
"azure-gpt-image": "azure",
"gemini-flash": "google-ai",
"gemini-pro": "google-ai",
"gemini-ultra": "google-ai",
@@ -189,9 +194,10 @@ export const MODEL_FAMILY_SERVICE: {
"mistral-large": "mistral-ai",
};
- export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e"];
+ export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e", "gpt-image", "azure-gpt-image"];
export const OPENAI_MODEL_FAMILY_MAP: { [regex: string]: OpenAIModelFamily } = {
"^gpt-image(-\\d+)?(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt-image",
"^gpt-4\\.5(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt45",
"^gpt-4\\.1(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41",
"^gpt-4\\.1-mini(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41-mini",
+10
@@ -83,6 +83,16 @@ export function getTokenCostUsd(model: ModelFamily, tokens: number) {
case "dall-e":
cost = 0.00001;
break;
case "azure-gpt-image":
case "gpt-image":
// gpt-image-1 pricing:
// Text input tokens: $5 per 1M tokens
// Image input tokens: $10 per 1M tokens
// Image output tokens: $40 per 1M tokens
// Weighted average assuming a mix of text/image input and output
// Typical cost is $0.02-$0.19 per image depending on quality
cost = 0.000018; // Balanced estimate accounting for input/output mix
break;
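// Worked example (illustrative; the token count is assumed from OpenAI's
// published figures): a medium-quality 1024x1024 image is ~1,056 output
// tokens, so
//   1056 tokens x $40 / 1M tokens   = ~$0.042 at the pure output rate, vs.
//   1056 tokens x $0.000018 / token = ~$0.019 at the blended rate above.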
case "aws-claude":
case "gcp-claude":
case "claude":