diff --git a/src/info-page.ts b/src/info-page.ts
index dbe5838..cb5a3b7 100644
--- a/src/info-page.ts
+++ b/src/info-page.ts
@@ -30,6 +30,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
"o3": "OpenAI o3",
"o4-mini": "OpenAI o4 mini",
"dall-e": "DALL-E",
+ "gpt-image": "GPT Image",
claude: "Claude (Sonnet)",
"claude-opus": "Claude (Opus)",
"gemini-flash": "Gemini Flash",
@@ -63,6 +64,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
"azure-o3": "Azure o3",
"azure-o4-mini": "Azure o4 mini",
"azure-dall-e": "Azure DALL-E",
+ "azure-gpt-image": "Azure GPT Image",
};
const converter = new showdown.Converter();
@@ -213,15 +215,15 @@ function getServerTitle() {
}
function buildRecentImageSection() {
- const dalleModels: ModelFamily[] = ["azure-dall-e", "dall-e"];
+ const imageModels: ModelFamily[] = ["azure-dall-e", "dall-e", "gpt-image", "azure-gpt-image"];
if (
!config.showRecentImages ||
- dalleModels.every((f) => !config.allowedModelFamilies.includes(f))
+ imageModels.every((f) => !config.allowedModelFamilies.includes(f))
) {
return "";
}
- let html = `
Recent DALL-E Generations
`;
+ let html = `Recent Image Generations
`;
const recentImages = getLastNImages(12).reverse();
if (recentImages.length === 0) {
html += `No images yet.
`;
diff --git a/src/proxy/openai-image.ts b/src/proxy/openai-image.ts
index 0ec7391..2813174 100644
--- a/src/proxy/openai-image.ts
+++ b/src/proxy/openai-image.ts
@@ -11,7 +11,7 @@ import { ProxyResHandlerWithBody } from "./middleware/response";
import { ProxyReqManager } from "./middleware/request/proxy-req-manager";
import { createQueuedProxyMiddleware } from "./middleware/request/proxy-middleware-factory";
-const KNOWN_MODELS = ["dall-e-2", "dall-e-3"];
+const KNOWN_MODELS = ["dall-e-2", "dall-e-3", "gpt-image-1"];
let modelListCache: any = null;
let modelListValid = 0;
@@ -58,27 +58,46 @@ function transformResponseForChat(
req: Request
): Record {
const prompt = imageBody.data[0].revised_prompt ?? req.body.prompt;
+ const isGptImage = req.body.model?.includes("gpt-image") || false;
+
const content = imageBody.data
.map((item) => {
const { url, b64_json } = item;
+ // The gpt-image-1 model always returns b64_json
+ // Format will depend on output_format parameter (defaults to png)
+ // For simplicity, we'll assume png if not specified
+ const format = req.body.output_format || "png";
+
if (b64_json) {
- return ``;
+ return ``;
} else {
return ``;
}
})
.join("\n\n");
+ // Prepare the usage information - gpt-image-1 includes detailed token usage
+ let usage = {
+ prompt_tokens: 0,
+ completion_tokens: req.outputTokens,
+ total_tokens: req.outputTokens,
+ };
+
+ // If this is a gpt-image-1 response, it includes detailed usage info
+ if (imageBody.usage) {
+ usage = {
+ prompt_tokens: imageBody.usage.input_tokens || 0,
+ completion_tokens: imageBody.usage.output_tokens || 0,
+ total_tokens: imageBody.usage.total_tokens || 0,
+ };
+ }
+
return {
- id: "dalle-" + req.id,
+ id: req.body.model?.includes("gpt-image") ? "gptimage-" + req.id : "dalle-" + req.id,
object: "chat.completion",
created: Date.now(),
model: req.body.model,
- usage: {
- prompt_tokens: 0,
- completion_tokens: req.outputTokens,
- total_tokens: req.outputTokens,
- },
+ usage,
choices: [
{
message: { role: "assistant", content },
@@ -89,6 +108,56 @@ function transformResponseForChat(
};
}
+// Filter parameters based on the model being used to avoid sending unsupported parameters
+function filterModelParameters(manager: ProxyReqManager) {
+ const req = manager.request;
+ const originalBody = req.body;
+ const modelName = originalBody?.model || "";
+
+ // Skip if no body or it's not an object
+ if (!originalBody || typeof originalBody !== 'object') return;
+
+ // Create a shallow copy of the body to filter (spread copies top-level keys only)
+ const filteredBody = { ...originalBody };
+
+ // Define allowed parameters for each model
+ if (modelName.includes('dall-e-2')) {
+ // DALL-E 2 parameters
+ const allowedParams = [
+ 'model', 'prompt', 'n', 'size', 'response_format', 'user'
+ ];
+
+ // Remove any parameter not in the allowed list
+ Object.keys(filteredBody).forEach(key => {
+ if (!allowedParams.includes(key)) {
+ delete filteredBody[key];
+ }
+ });
+
+ req.log.info({ model: 'dall-e-2', params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 2");
+ } else if (modelName.includes('dall-e-3')) {
+ // DALL-E 3 parameters
+ const allowedParams = [
+ 'model', 'prompt', 'n', 'quality', 'size', 'style', 'response_format', 'user'
+ ];
+
+ // Remove any parameter not in the allowed list
+ Object.keys(filteredBody).forEach(key => {
+ if (!allowedParams.includes(key)) {
+ delete filteredBody[key];
+ }
+ });
+
+ req.log.info({ model: 'dall-e-3', params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 3");
+ } else if (modelName.includes('gpt-image')) {
+ // gpt-image-1 rejects response_format (it always returns b64_json) and the dall-e quality values; normalize them
+ delete filteredBody.response_format; if (filteredBody.quality === "standard" || filteredBody.quality === "hd") filteredBody.quality = "auto";
+ }
+
+ // Use the proper method to update the body
+ manager.setBody(filteredBody);
+}
+
function replacePath(manager: ProxyReqManager) {
const req = manager.request;
const pathname = req.url.split("?")[0];
@@ -100,7 +169,7 @@ function replacePath(manager: ProxyReqManager) {
const openaiImagesProxy = createQueuedProxyMiddleware({
target: "https://api.openai.com",
- mutations: [replacePath, addKey, finalizeBody],
+ mutations: [replacePath, filterModelParameters, addKey, finalizeBody],
blockingResponseHandler: openaiImagesResponseHandler,
});
@@ -116,6 +185,17 @@ openaiImagesRouter.post(
}),
openaiImagesProxy
);
+// Add support for the /v1/images/edits endpoint (used by gpt-image-1 for image editing)
+openaiImagesRouter.post(
+ "/v1/images/edits",
+ ipLimiter,
+ createPreprocessorMiddleware({
+ inApi: "openai-image",
+ outApi: "openai-image",
+ service: "openai",
+ }),
+ openaiImagesProxy
+);
openaiImagesRouter.post(
"/v1/chat/completions",
ipLimiter,
diff --git a/src/shared/api-schemas/openai-image.ts b/src/shared/api-schemas/openai-image.ts
index 7133362..bab379e 100644
--- a/src/shared/api-schemas/openai-image.ts
+++ b/src/shared/api-schemas/openai-image.ts
@@ -1,20 +1,58 @@
import { z } from "zod";
+import { Request } from "express";
import { OpenAIV1ChatCompletionSchema } from "./openai";
import { APIFormatTransformer } from "./index";
+// Extend the Express Request type to include multimodal content
+declare global {
+ namespace Express {
+ interface Request {
+ multimodalContent?: {
+ prompt?: string;
+ images?: string[];
+ };
+ }
+ }
+}
+
// https://platform.openai.com/docs/api-reference/images/create
export const OpenAIV1ImagesGenerationSchema = z
.object({
- prompt: z.string().max(4000),
+ prompt: z.string().max(32000), // gpt-image-1 supports up to 32000 chars
model: z.string().max(100).optional(),
- quality: z.enum(["standard", "hd"]).optional().default("standard"),
- n: z.number().int().min(1).max(4).optional().default(1),
- response_format: z.enum(["url", "b64_json"]).optional(),
+ // Support for image inputs (multimodal capability of gpt-image-1)
+ image: z.union([
+ z.string(), // single image (base64 or URL)
+ z.array(z.string()) // array of images
+ ]).optional(),
+ mask: z.string().optional(), // mask image for editing
+ // Different quality options based on model
+ quality: z
+ .union([
+ z.enum(["standard", "hd"]), // dall-e-3 options
+ z.enum(["high", "medium", "low"]), // gpt-image-1 options
+ z.literal("auto") // default for gpt-image-1
+ ])
+ .optional()
+ .default("standard"),
+ n: z.number().int().min(1).max(10).optional().default(1), // gpt-image-1 supports up to 10
+ response_format: z.enum(["url", "b64_json"]).optional(), // Note: gpt-image-1 always returns b64_json
+ // Enhanced size options for gpt-image-1
size: z
- .enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"])
+ .union([
+ // dalle models
+ z.enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"]),
+ // gpt-image-1 models (adds landscape, portrait, auto)
+ z.enum(["1024x1024", "1536x1024", "1024x1536", "auto"])
+ ])
.optional()
.default("1024x1024"),
- style: z.enum(["vivid", "natural"]).optional().default("vivid"),
+ style: z.enum(["vivid", "natural"]).optional().default("vivid"), // dall-e-3 only
+ // New gpt-image-1 specific parameters
+ background: z.enum(["transparent", "opaque", "auto"]).optional(), // gpt-image-1 only
+ moderation: z.enum(["low", "auto"]).optional(), // gpt-image-1 only
+ output_compression: z.number().int().min(0).max(100).optional(), // gpt-image-1 only
+ output_format: z.enum(["png", "jpeg", "webp"]).optional(), // gpt-image-1 only
user: z.string().max(500).optional(),
})
.strip();
@@ -34,9 +72,41 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
}
const { messages } = result.data;
- const prompt = messages.filter((m) => m.role === "user").pop()?.content;
- if (Array.isArray(prompt)) {
- throw new Error("Image generation prompt must be a text message.");
+ const userMessage = messages.filter((m) => m.role === "user").pop();
+ if (!userMessage) {
+ throw new Error("No user message found in the request.");
+ }
+
+ const content = userMessage.content;
+
+ // Handle array content (multimodal content with text and images)
+ if (Array.isArray(content)) {
+ const textParts: string[] = [];
+ const imageParts: string[] = [];
+
+ // Process content parts, extracting text and images
+ content.forEach(part => {
+ if (typeof part === 'string') {
+ textParts.push(part);
+ } else if (part.type === 'image_url') {
+ // Extract image URL or base64 data from the content
+ const imageUrl = typeof part.image_url === 'string'
+ ? part.image_url
+ : part.image_url.url;
+ imageParts.push(imageUrl);
+ }
+ });
+
+ // Join all text parts to form the prompt
+ const prompt = textParts.join('\n');
+
+ // For gpt-image-1, we'll pass both the text prompt and image(s)
+ req.multimodalContent = {
+ prompt,
+ images: imageParts
+ };
+ } else if (typeof content !== 'string') {
+ throw new Error("Image generation prompt must be a text message or multimodal content.");
}
if (body.stream) {
@@ -49,20 +119,172 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
// character name or wrapping the entire thing in quotes. We will look for
// the index of "Image:" and use everything after that as the prompt.
- const index = prompt?.toLowerCase().indexOf("image:");
- if (index === -1 || !prompt) {
- throw new Error(
- `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${prompt}).`
- );
+ // For multimodal requests (image editing with gpt-image-1), we don't require the "Image:" prefix
+ const isMultimodalRequest = Array.isArray(content) && req.multimodalContent?.images && req.multimodalContent.images.length > 0;
+
+ // Only enforce the "Image:" prefix for non-multimodal requests
+ if (!isMultimodalRequest && typeof content === 'string') {
+ const textIndex = content.toLowerCase().indexOf("image:");
+ if (textIndex === -1) {
+ throw new Error(
+ `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${content}).`
+ );
+ }
}
// TODO: Add some way to specify parameters via chat message
- const transformed = {
- model: body.model.includes("dall-e") ? body.model : "dall-e-3",
- quality: "standard",
- size: "1024x1024",
- response_format: "url",
- prompt: prompt.slice(index! + 6).trim(),
+ // Determine which model to use (gpt-image-1 or dall-e-3)
+ const isGptImage = body.model?.includes("gpt-image") || false;
+
+ // Get the correct text prompt either from multimodal content or plain string content
+ let textPrompt: string | undefined;
+ let index = -1;
+
+ if (Array.isArray(content)) {
+ textPrompt = req.multimodalContent?.prompt;
+ } else if (typeof content === 'string') {
+ index = content.toLowerCase().indexOf("image:");
+ textPrompt = index !== -1 ? content.slice(index + 6).trim() : content;
+ }
+
+ // Validate that we have a text prompt
+ if (!textPrompt) {
+ throw new Error("No text prompt found in the request.");
+ }
+
+ // Determine the exact model being used
+ let modelName = "dall-e-2"; // Overwritten below; unmatched models fall through to dall-e-3
+
+ if (isGptImage) {
+ modelName = "gpt-image-1";
+ } else if (body.model?.includes("dall-e-3")) {
+ modelName = "dall-e-3";
+ } else if (body.model?.includes("dall-e-2")) {
+ modelName = "dall-e-2";
+ } else {
+ // If no specific model requested, default to dall-e-3
+ modelName = "dall-e-3";
+ }
+
+ // Start with basic parameters common to all models
+ const transformed: any = {
+ model: modelName,
+ prompt: textPrompt,
};
+
+ // Add model-specific parameters
+ if (modelName === "gpt-image-1") {
+ // GPT Image specific parameters
+ transformed.quality = "auto"; // Default quality for gpt-image-1
+ transformed.size = "1024x1024"; // Default size (square)
+ transformed.moderation = "low"; // Always set moderation to low for gpt-image-1
+
+ // Optional GPT Image parameters
+ if (body.background) transformed.background = body.background;
+ if (body.output_format) transformed.output_format = body.output_format;
+ if (body.output_compression) transformed.output_compression = body.output_compression;
+
+ // Handle specific quality settings for gpt-image-1
+ if (body.quality && ["high", "medium", "low", "auto"].includes(body.quality)) {
+ transformed.quality = body.quality;
+ }
+
+ // Handle specific size settings for gpt-image-1
+ if (body.size && ["1024x1024", "1536x1024", "1024x1536", "auto"].includes(body.size)) {
+ transformed.size = body.size;
+ }
+
+ // No response_format for gpt-image-1 as it always returns b64_json
+ } else if (modelName === "dall-e-3") {
+ // DALL-E 3 specific parameters
+ transformed.size = "1024x1024"; // Default size
+ transformed.response_format = "url"; // Default format
+ transformed.quality = "standard"; // Default quality
+
+ // Handle DALL-E 3 style parameter
+ if (body.style && ["vivid", "natural"].includes(body.style)) {
+ transformed.style = body.style;
+ } else {
+ transformed.style = "vivid"; // Default style
+ }
+
+ // Handle specific quality settings for dall-e-3
+ if (body.quality && ["standard", "hd"].includes(body.quality)) {
+ transformed.quality = body.quality;
+ }
+
+ // Handle specific size settings for dall-e-3
+ if (body.size && ["1024x1024", "1792x1024", "1024x1792"].includes(body.size)) {
+ transformed.size = body.size;
+ }
+ } else {
+ // DALL-E 2 specific parameters
+ transformed.size = "1024x1024"; // Default size
+ transformed.response_format = "url"; // Default format
+
+ // NO quality parameter for dall-e-2
+ // Explicitly remove the quality parameter before sending
+ delete transformed.quality;
+
+ // Handle specific size settings for dall-e-2
+ if (body.size && ["256x256", "512x512", "1024x1024"].includes(body.size)) {
+ transformed.size = body.size;
+ }
+ }
+
+ // Handle common parameters
+ if (body.n != null && !Number.isNaN(Number(body.n))) {
+ // For dall-e-3, only n=1 is supported
+ if (modelName === "dall-e-3" && Number(body.n) > 1) {
+ transformed.n = 1;
+ } else {
+ transformed.n = Number(body.n);
+ }
+ }
+
+ // Handle response_format for non-gpt-image models
+ if (!isGptImage && body.response_format && ["url", "b64_json"].includes(body.response_format)) {
+ transformed.response_format = body.response_format;
+ }
+
+ // If this is gpt-image-1 and we have image content, add it to the transformed request
+ if (isGptImage && req.multimodalContent?.images && req.multimodalContent.images.length > 0) {
+ // For the edit endpoint, we need to format the images properly
+ transformed.image = req.multimodalContent.images.length === 1
+ ? req.multimodalContent.images[0]
+ : req.multimodalContent.images;
+
+ // Any request with images for gpt-image-1 should use the edits endpoint
+ req.log.info(`${req.multimodalContent.images.length} image(s) detected for gpt-image-1, using images/edits endpoint`);
+ if (req.path.startsWith("/v1/chat/completions")) {
+ req.url = req.url.replace("/v1/chat/completions", "/v1/images/edits");
+ }
+ }
+ // For dall-e-2, we need to make sure we don't introduce unsupported parameters
+ // due to default values in the schema. Let's bypass Zod schema validation here
+ // for dall-e-2 and only include the supported parameters.
+ if (modelName === "dall-e-2") {
+ // Only include parameters that dall-e-2 supports
+ const filteredTransformed: any = {};
+
+ // List of parameters supported by dall-e-2
+ const supportedParams = [
+ "model", "prompt", "n", "size", "response_format", "user"
+ ];
+
+ // Copy only supported parameters
+ for (const param of supportedParams) {
+ if (transformed[param] !== undefined) {
+ filteredTransformed[param] = transformed[param];
+ }
+ }
+
+ // Log what we're sending
+ req.log.info({ params: Object.keys(filteredTransformed) }, "Filtered parameters for dall-e-2");
+
+ return filteredTransformed;
+ }
+
+ // For other models, use the schema as normal
return OpenAIV1ImagesGenerationSchema.parse(transformed);
};
diff --git a/src/shared/file-storage/mirror-generated-image.ts b/src/shared/file-storage/mirror-generated-image.ts
index 978a858..4ee9494 100644
--- a/src/shared/file-storage/mirror-generated-image.ts
+++ b/src/shared/file-storage/mirror-generated-image.ts
@@ -13,9 +13,19 @@ export type OpenAIImageGenerationResult = {
created: number;
data: {
revised_prompt?: string;
- url: string;
- b64_json: string;
+ url?: string; // gpt-image-1 doesn't return URLs, only b64_json
+ b64_json?: string;
}[];
+ // Added for gpt-image-1 responses
+ usage?: {
+ total_tokens: number;
+ input_tokens: number;
+ output_tokens: number;
+ input_tokens_details?: {
+ text_tokens: number;
+ image_tokens: number;
+ };
+ };
};
async function downloadImage(url: string) {
@@ -65,11 +75,16 @@ export async function mirrorGeneratedImage(
let mirror: string;
if (item.b64_json) {
mirror = await saveB64Image(item.b64_json);
- } else {
+ } else if (item.url) {
mirror = await downloadImage(item.url);
+ } else {
+ req.log.warn("No image data found in response");
+ continue;
}
+ // Set the URL to our mirrored version
item.url = `${host}/user_content/${path.basename(mirror)}`;
await createThumbnail(mirror);
+ // Add to image history with the local URL
addToImageHistory({
url: item.url,
prompt,
diff --git a/src/shared/key-management/azure/provider.ts b/src/shared/key-management/azure/provider.ts
index 535acac..7a5a30d 100644
--- a/src/shared/key-management/azure/provider.ts
+++ b/src/shared/key-management/azure/provider.ts
@@ -84,6 +84,7 @@ export class AzureOpenAIKeyProvider implements KeyProvider {
"azure-o3Tokens": 0,
"azure-o4-miniTokens": 0,
"azure-dall-eTokens": 0,
+ "azure-gpt-imageTokens": 0,
modelIds: [],
};
this.keys.push(newKey);
diff --git a/src/shared/key-management/openai/provider.ts b/src/shared/key-management/openai/provider.ts
index 92fa8b7..24159e7 100644
--- a/src/shared/key-management/openai/provider.ts
+++ b/src/shared/key-management/openai/provider.ts
@@ -124,6 +124,7 @@ export class OpenAIKeyProvider implements KeyProvider {
"o3Tokens": 0,
"o4-miniTokens": 0,
"dall-eTokens": 0,
+ "gpt-imageTokens": 0,
modelIds: [],
};
this.keys.push(newKey);
diff --git a/src/shared/models.ts b/src/shared/models.ts
index d67085b..4c2fed2 100644
--- a/src/shared/models.ts
+++ b/src/shared/models.ts
@@ -34,7 +34,8 @@ export type OpenAIModelFamily =
| "o3-mini"
| "o3"
| "o4-mini"
- | "dall-e";
+ | "dall-e"
+ | "gpt-image";
export type AnthropicModelFamily = "claude" | "claude-opus";
export type GoogleAIModelFamily =
| "gemini-flash"
@@ -84,6 +85,7 @@ export const MODEL_FAMILIES = ((
"o3",
"o4-mini",
"dall-e",
+ "gpt-image",
"claude",
"claude-opus",
"gemini-flash",
@@ -117,6 +119,7 @@ export const MODEL_FAMILIES = ((
"azure-o3-mini",
"azure-o3",
"azure-o4-mini",
+ "azure-gpt-image",
] as const);
export const LLM_SERVICES = ((
@@ -154,6 +157,7 @@ export const MODEL_FAMILY_SERVICE: {
"o3": "openai",
"o4-mini": "openai",
"dall-e": "openai",
+ "gpt-image": "openai",
claude: "anthropic",
"claude-opus": "anthropic",
"aws-claude": "aws",
@@ -180,6 +184,7 @@ export const MODEL_FAMILY_SERVICE: {
"azure-o3-mini": "azure",
"azure-o3": "azure",
"azure-o4-mini": "azure",
+ "azure-gpt-image": "azure",
"gemini-flash": "google-ai",
"gemini-pro": "google-ai",
"gemini-ultra": "google-ai",
@@ -189,9 +194,10 @@ export const MODEL_FAMILY_SERVICE: {
"mistral-large": "mistral-ai",
};
-export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e"];
+export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e", "gpt-image", "azure-gpt-image"];
export const OPENAI_MODEL_FAMILY_MAP: { [regex: string]: OpenAIModelFamily } = {
+ "^gpt-image(-\\d+)?(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt-image",
"^gpt-4\\.5(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt45",
"^gpt-4\\.1(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41",
"^gpt-4\\.1-mini(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41-mini",
diff --git a/src/shared/stats.ts b/src/shared/stats.ts
index 16c25d1..95912f5 100644
--- a/src/shared/stats.ts
+++ b/src/shared/stats.ts
@@ -83,6 +83,16 @@ export function getTokenCostUsd(model: ModelFamily, tokens: number) {
case "dall-e":
cost = 0.00001;
break;
+ case "azure-gpt-image":
+ case "gpt-image":
+ // gpt-image-1 pricing:
+ // Text input tokens: $5 per 1M tokens
+ // Image input tokens: $10 per 1M tokens
+ // Image output tokens: $40 per 1M tokens
+ // Weighted average assuming a mix of text/image input and output
+ // Typical cost is $0.02-$0.19 per image depending on quality
+ cost = 0.000018; // Balanced estimate accounting for input/output mix
+ break;
case "aws-claude":
case "gcp-claude":
case "claude":