prelim gpt-image support (can't test, no access)

reanon
2025-04-25 10:38:23 +02:00
parent 465b13e5fb
commit a16d66a45b
8 changed files with 374 additions and 37 deletions
+5 -3
@@ -30,6 +30,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
"o3": "OpenAI o3",
"o4-mini": "OpenAI o4 mini",
"dall-e": "DALL-E",
"gpt-image": "GPT Image",
claude: "Claude (Sonnet)",
"claude-opus": "Claude (Opus)",
"gemini-flash": "Gemini Flash",
@@ -63,6 +64,7 @@ const MODEL_FAMILY_FRIENDLY_NAME: { [f in ModelFamily]: string } = {
"azure-o3": "Azure o3",
"azure-o4-mini": "Azure o4 mini",
"azure-dall-e": "Azure DALL-E",
"azure-gpt-image": "Azure GPT Image",
};
const converter = new showdown.Converter();
@@ -213,15 +215,15 @@ function getServerTitle() {
}
function buildRecentImageSection() {
- const dalleModels: ModelFamily[] = ["azure-dall-e", "dall-e"];
+ const imageModels: ModelFamily[] = ["azure-dall-e", "dall-e", "gpt-image", "azure-gpt-image"];
if (
!config.showRecentImages ||
- dalleModels.every((f) => !config.allowedModelFamilies.includes(f))
+ imageModels.every((f) => !config.allowedModelFamilies.includes(f))
) {
return "";
}
- let html = `<h2>Recent DALL-E Generations</h2>`;
+ let html = `<h2>Recent Image Generations</h2>`;
const recentImages = getLastNImages(12).reverse();
if (recentImages.length === 0) {
html += `<p>No images yet.</p>`;
+89 -9
@@ -11,7 +11,7 @@ import { ProxyResHandlerWithBody } from "./middleware/response";
import { ProxyReqManager } from "./middleware/request/proxy-req-manager";
import { createQueuedProxyMiddleware } from "./middleware/request/proxy-middleware-factory";
- const KNOWN_MODELS = ["dall-e-2", "dall-e-3"];
+ const KNOWN_MODELS = ["dall-e-2", "dall-e-3", "gpt-image-1"];
let modelListCache: any = null;
let modelListValid = 0;
@@ -58,27 +58,46 @@ function transformResponseForChat(
req: Request
): Record<string, any> {
const prompt = imageBody.data[0].revised_prompt ?? req.body.prompt;
+ const isGptImage = req.body.model?.includes("gpt-image") || false;
const content = imageBody.data
.map((item) => {
const { url, b64_json } = item;
+ // The gpt-image-1 model always returns b64_json
+ // Format will depend on output_format parameter (defaults to png)
+ // For simplicity, we'll assume png if not specified
+ const format = req.body.output_format || "png";
if (b64_json) {
- return `![${prompt}](data:image/png;base64,${b64_json})`;
+ return `![${prompt}](data:image/${format};base64,${b64_json})`;
} else {
return `![${prompt}](${url})`;
}
})
.join("\n\n");
+ // Prepare the usage information - gpt-image-1 includes detailed token usage
+ let usage = {
+ prompt_tokens: 0,
+ completion_tokens: req.outputTokens,
+ total_tokens: req.outputTokens,
+ };
+ // If this is a gpt-image-1 response, it includes detailed usage info
+ if (imageBody.usage) {
+ usage = {
+ prompt_tokens: imageBody.usage.input_tokens || 0,
+ completion_tokens: imageBody.usage.output_tokens || 0,
+ total_tokens: imageBody.usage.total_tokens || 0,
+ };
+ }
return {
id: "dalle-" + req.id,
id: req.body.model?.includes("gpt-image") ? "gptimage-" + req.id : "dalle-" + req.id,
object: "chat.completion",
created: Date.now(),
model: req.body.model,
- usage: {
- prompt_tokens: 0,
- completion_tokens: req.outputTokens,
- total_tokens: req.outputTokens,
- },
+ usage,
choices: [
{
message: { role: "assistant", content },
@@ -89,6 +108,56 @@ function transformResponseForChat(
};
}
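// Illustrative sketch (not part of the commit; ids and token counts are
// hypothetical): for a gpt-image-1 request, the transformed chat completion
// now carries real token counts in `usage` instead of zeros:
//   {
//     id: "gptimage-abc123",
//     object: "chat.completion",
//     model: "gpt-image-1",
//     usage: { prompt_tokens: 123, completion_tokens: 1056, total_tokens: 1179 },
//     choices: [{ message: { role: "assistant", content: "![prompt](data:image/png;base64,...)" } }]
//   }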
+ // Filter parameters based on the model being used to avoid sending unsupported parameters
+ function filterModelParameters(manager: ProxyReqManager) {
+ const req = manager.request;
+ const originalBody = req.body;
+ const modelName = originalBody?.model || "";
+ // Skip if there is no body or it's not an object
+ if (!originalBody || typeof originalBody !== "object") return;
+ // Create a shallow copy of the body to filter (only top-level keys are removed)
+ const filteredBody = { ...originalBody };
+ // Define allowed parameters for each model
+ if (modelName.includes("dall-e-2")) {
+ // DALL-E 2 parameters
+ const allowedParams = [
+ "model", "prompt", "n", "size", "response_format", "user"
+ ];
+ // Remove any parameter not in the allowed list
+ Object.keys(filteredBody).forEach((key) => {
+ if (!allowedParams.includes(key)) {
+ delete filteredBody[key];
+ }
+ });
+ req.log.info({ model: "dall-e-2", params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 2");
+ } else if (modelName.includes("dall-e-3")) {
+ // DALL-E 3 parameters
+ const allowedParams = [
+ "model", "prompt", "n", "quality", "size", "style", "response_format", "user"
+ ];
+ // Remove any parameter not in the allowed list
+ Object.keys(filteredBody).forEach((key) => {
+ if (!allowedParams.includes(key)) {
+ delete filteredBody[key];
+ }
+ });
+ req.log.info({ model: "dall-e-3", params: Object.keys(filteredBody) }, "Filtered parameters for DALL-E 3");
+ } else if (modelName.includes("gpt-image")) {
+ // gpt-image-1 accepts all of the new parameters, so nothing is stripped
+ req.log.info({ model: "gpt-image-1", params: Object.keys(filteredBody) }, "Using all parameters for GPT Image");
+ }
+ // Use the ProxyReqManager method to replace the outgoing body
+ manager.setBody(filteredBody);
+ }
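// Illustrative example of the filtering above (assumed request body, not from
// the commit): gpt-image-1-only keys sent to dall-e-2 are stripped before the
// request is proxied upstream.
//   input:  { model: "dall-e-2", prompt: "a red fox", background: "transparent", quality: "high" }
//   output: { model: "dall-e-2", prompt: "a red fox" }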
function replacePath(manager: ProxyReqManager) {
const req = manager.request;
const pathname = req.url.split("?")[0];
@@ -100,7 +169,7 @@ function replacePath(manager: ProxyReqManager) {
const openaiImagesProxy = createQueuedProxyMiddleware({
target: "https://api.openai.com",
- mutations: [replacePath, addKey, finalizeBody],
+ mutations: [replacePath, filterModelParameters, addKey, finalizeBody],
blockingResponseHandler: openaiImagesResponseHandler,
});
@@ -116,6 +185,17 @@ openaiImagesRouter.post(
}),
openaiImagesProxy
);
+ // Add support for the /v1/images/edits endpoint (used by gpt-image-1 for image editing)
+ openaiImagesRouter.post(
+ "/v1/images/edits",
+ ipLimiter,
+ createPreprocessorMiddleware({
+ inApi: "openai-image",
+ outApi: "openai-image",
+ service: "openai",
+ }),
+ openaiImagesProxy
+ );
openaiImagesRouter.post(
"/v1/chat/completions",
ipLimiter,
+242 -20
@@ -1,20 +1,58 @@
import { z } from "zod";
import { Request } from "express";
import { OpenAIV1ChatCompletionSchema } from "./openai";
import { APIFormatTransformer } from "./index";
+ // Extend the Express Request type to include multimodal content
+ declare global {
+ namespace Express {
+ interface Request {
+ multimodalContent?: {
+ prompt?: string;
+ images?: string[];
+ };
+ }
+ }
+ }
// https://platform.openai.com/docs/api-reference/images/create
export const OpenAIV1ImagesGenerationSchema = z
.object({
- prompt: z.string().max(4000),
+ prompt: z.string().max(32000), // gpt-image-1 accepts up to 32,000 chars; DALL-E's lower limits are enforced upstream
model: z.string().max(100).optional(),
quality: z.enum(["standard", "hd"]).optional().default("standard"),
n: z.number().int().min(1).max(4).optional().default(1),
response_format: z.enum(["url", "b64_json"]).optional(),
// Support for image inputs (multimodal capability of gpt-image-1)
image: z.union([
z.string(), // single image (base64 or URL)
z.array(z.string()) // array of images
]).optional(),
mask: z.string().optional(), // mask image for editing
// Different quality options based on model
quality: z
.union([
z.enum(["standard", "hd"]), // dall-e-3 options
z.enum(["high", "medium", "low"]), // gpt-image-1 options
z.literal("auto") // default for gpt-image-1
])
.optional()
.default("standard"),
n: z.number().int().min(1).max(10).optional().default(1), // gpt-image-1 supports up to 10
response_format: z.enum(["url", "b64_json"]).optional(), // Note: gpt-image-1 always returns b64_json
// Enhanced size options for gpt-image-1
size: z
.enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"])
.union([
// dalle models
z.enum(["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"]),
// gpt-image-1 models (adds landscape, portrait, auto)
z.enum(["1024x1024", "1536x1024", "1024x1536", "auto"])
])
.optional()
.default("1024x1024"),
style: z.enum(["vivid", "natural"]).optional().default("vivid"),
style: z.enum(["vivid", "natural"]).optional().default("vivid"), // dall-e-3 only
// New gpt-image-1 specific parameters
background: z.enum(["transparent", "opaque", "auto"]).optional(), // gpt-image-1 only
moderation: z.enum(["low", "auto"]).optional(), // gpt-image-1 only
output_compression: z.number().int().min(0).max(100).optional(), // gpt-image-1 only
output_format: z.enum(["png", "jpeg", "webp"]).optional(), // gpt-image-1 only
user: z.string().max(500).optional(),
})
.strip();
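// Usage sketch (illustrative, not part of the commit): the widened schema now
// accepts gpt-image-1 parameters that the old dall-e-only enums rejected.
// const body = OpenAIV1ImagesGenerationSchema.parse({
//   model: "gpt-image-1",
//   prompt: "Image: a watercolor fox in the snow",
//   quality: "medium",     // gpt-image-1 tier
//   size: "1536x1024",     // landscape, new with gpt-image-1
//   output_format: "webp", // gpt-image-1 only
//   n: 2,
// });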
@@ -34,9 +72,41 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
}
const { messages } = result.data;
- const prompt = messages.filter((m) => m.role === "user").pop()?.content;
- if (Array.isArray(prompt)) {
- throw new Error("Image generation prompt must be a text message.");
+ const userMessage = messages.filter((m) => m.role === "user").pop();
+ if (!userMessage) {
+ throw new Error("No user message found in the request.");
}
+ const content = userMessage.content;
+ // Handle array content (multimodal content with text and images)
+ if (Array.isArray(content)) {
+ const textParts: string[] = [];
+ const imageParts: string[] = [];
+ // Process content parts, extracting text and images
+ content.forEach((part) => {
+ if (typeof part === "string") {
+ textParts.push(part);
+ } else if (part.type === "text") {
+ // Text parts arrive as { type: "text", text: "..." } objects
+ textParts.push(part.text);
+ } else if (part.type === "image_url") {
+ // Extract image URL or base64 data from the content
+ const imageUrl = typeof part.image_url === "string"
+ ? part.image_url
+ : part.image_url.url;
+ imageParts.push(imageUrl);
+ }
+ });
+ // Join all text parts to form the prompt
+ const prompt = textParts.join("\n");
+ // For gpt-image-1, we'll pass both the text prompt and image(s)
+ req.multimodalContent = {
+ prompt,
+ images: imageParts
+ };
+ } else if (typeof content !== "string") {
+ throw new Error("Image generation prompt must be a text message or multimodal content.");
+ }
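// Illustrative example (shapes follow the OpenAI chat format; values are
// hypothetical): a user message like
//   { role: "user", content: [
//       { type: "text", text: "Image: add a red scarf" },
//       { type: "image_url", image_url: { url: "data:image/png;base64,..." } } ] }
// yields req.multimodalContent =
//   { prompt: "Image: add a red scarf", images: ["data:image/png;base64,..."] }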
if (body.stream) {
@@ -49,20 +119,172 @@ export const transformOpenAIToOpenAIImage: APIFormatTransformer<
// character name or wrapping the entire thing in quotes. We will look for
// the index of "Image:" and use everything after that as the prompt.
- const index = prompt?.toLowerCase().indexOf("image:");
- if (index === -1 || !prompt) {
- throw new Error(
- `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${prompt}).`
- );
- }
+ // For multimodal requests (image editing with gpt-image-1), we don't require the "Image:" prefix
+ const isMultimodalRequest = Array.isArray(content) && req.multimodalContent?.images && req.multimodalContent.images.length > 0;
+ // Only enforce the "Image:" prefix for non-multimodal requests
+ if (!isMultimodalRequest && typeof content === "string") {
+ const textIndex = content.toLowerCase().indexOf("image:");
+ if (textIndex === -1) {
+ throw new Error(
+ `Start your prompt with 'Image:' followed by a description of the image you want to generate (received: ${content}).`
+ );
+ }
+ }
// TODO: Add some way to specify parameters via chat message
- const transformed = {
- model: body.model.includes("dall-e") ? body.model : "dall-e-3",
- quality: "standard",
- size: "1024x1024",
- response_format: "url",
- prompt: prompt.slice(index! + 6).trim(),
- };
+ // Determine which model to use (gpt-image-1 or dall-e-3)
+ const isGptImage = body.model?.includes("gpt-image") || false;
+ // Get the correct text prompt either from multimodal content or plain string content
+ let textPrompt: string | undefined;
+ let index = -1;
+ if (Array.isArray(content)) {
+ textPrompt = req.multimodalContent?.prompt;
+ } else if (typeof content === "string") {
+ index = content.toLowerCase().indexOf("image:");
+ textPrompt = index !== -1 ? content.slice(index + 6).trim() : content;
+ }
+ // Validate that we have a text prompt
+ if (!textPrompt) {
+ throw new Error("No text prompt found in the request.");
+ }
+ // Determine the exact model being used
+ let modelName = "dall-e-2"; // Default
+ if (isGptImage) {
+ modelName = "gpt-image-1";
+ } else if (body.model?.includes("dall-e-3")) {
+ modelName = "dall-e-3";
+ } else if (body.model?.includes("dall-e-2")) {
+ modelName = "dall-e-2";
+ } else {
+ // If no specific model requested, default to dall-e-3
+ modelName = "dall-e-3";
+ }
+ // Start with basic parameters common to all models
+ const transformed: any = {
+ model: modelName,
+ prompt: textPrompt,
+ };
+ // Add model-specific parameters
+ if (modelName === "gpt-image-1") {
+ // GPT Image specific parameters
+ transformed.quality = "auto"; // Default quality for gpt-image-1
+ transformed.size = "1024x1024"; // Default size (square)
+ transformed.moderation = "low"; // Always set moderation to low for gpt-image-1
+ // Optional GPT Image parameters
+ if (body.background) transformed.background = body.background;
+ if (body.output_format) transformed.output_format = body.output_format;
+ // Compare against undefined so a compression level of 0 is not dropped
+ if (body.output_compression !== undefined) transformed.output_compression = body.output_compression;
+ // Handle specific quality settings for gpt-image-1
+ if (body.quality && ["high", "medium", "low", "auto"].includes(body.quality)) {
+ transformed.quality = body.quality;
+ }
+ // Handle specific size settings for gpt-image-1
+ if (body.size && ["1024x1024", "1536x1024", "1024x1536", "auto"].includes(body.size)) {
+ transformed.size = body.size;
+ }
+ // No response_format for gpt-image-1 as it always returns b64_json
} else if (modelName === "dall-e-3") {
// DALL-E 3 specific parameters
transformed.size = "1024x1024"; // Default size
transformed.response_format = "url"; // Default format
transformed.quality = "standard"; // Default quality
// Handle DALL-E 3 style parameter
if (body.style && ["vivid", "natural"].includes(body.style)) {
transformed.style = body.style;
} else {
transformed.style = "vivid"; // Default style
}
// Handle specific quality settings for dall-e-3
if (body.quality && ["standard", "hd"].includes(body.quality)) {
transformed.quality = body.quality;
}
// Handle specific size settings for dall-e-3
if (body.size && ["1024x1024", "1792x1024", "1024x1792"].includes(body.size)) {
transformed.size = body.size;
}
} else {
// DALL-E 2 specific parameters
transformed.size = "1024x1024"; // Default size
transformed.response_format = "url"; // Default format
// NO quality parameter for dall-e-2
// Explicitly remove the quality parameter before sending
delete transformed.quality;
// Handle specific size settings for dall-e-2
if (body.size && ["256x256", "512x512", "1024x1024"].includes(body.size)) {
transformed.size = body.size;
}
}
+ // Handle common parameters (n is already an integer per the chat schema,
+ // so no string parsing is needed)
+ if (typeof body.n === "number") {
+ // For dall-e-3, only n=1 is supported
+ if (modelName === "dall-e-3" && body.n > 1) {
+ transformed.n = 1;
+ } else {
+ transformed.n = body.n;
+ }
+ }
+ // Handle response_format for non-gpt-image models
+ if (!isGptImage && body.response_format && ["url", "b64_json"].includes(body.response_format)) {
+ transformed.response_format = body.response_format;
+ }
+ // If this is gpt-image-1 and we have image content, add it to the transformed request
+ if (isGptImage && req.multimodalContent?.images && req.multimodalContent.images.length > 0) {
+ // For the edit endpoint, we need to format the images properly
+ transformed.image = req.multimodalContent.images.length === 1
+ ? req.multimodalContent.images[0]
+ : req.multimodalContent.images;
+ // Any request with images for gpt-image-1 should use the edits endpoint
+ req.log.info(`${req.multimodalContent.images.length} image(s) detected for gpt-image-1, using images/edits endpoint`);
+ if (req.path.startsWith("/v1/chat/completions")) {
+ req.url = req.url.replace("/v1/chat/completions", "/v1/images/edits");
+ }
+ }
+ // For dall-e-2, we need to make sure we don't introduce unsupported parameters
+ // due to default values in the schema. Let's bypass Zod schema validation here
+ // for dall-e-2 and only include the supported parameters.
+ if (modelName === "dall-e-2") {
+ // Only include parameters that dall-e-2 supports
+ const filteredTransformed: any = {};
+ // List of parameters supported by dall-e-2
+ const supportedParams = [
+ "model", "prompt", "n", "size", "response_format", "user"
+ ];
+ // Copy only supported parameters
+ for (const param of supportedParams) {
+ if (transformed[param] !== undefined) {
+ filteredTransformed[param] = transformed[param];
+ }
+ }
+ // Log what we're sending
+ req.log.info({ params: Object.keys(filteredTransformed) }, "Filtered parameters for dall-e-2");
+ return filteredTransformed;
+ }
+ // For other models, use the schema as normal
return OpenAIV1ImagesGenerationSchema.parse(transformed);
};
@@ -13,9 +13,19 @@ export type OpenAIImageGenerationResult = {
created: number;
data: {
revised_prompt?: string;
- url: string;
- b64_json: string;
+ url?: string; // gpt-image-1 doesn't return URLs, only b64_json
+ b64_json?: string;
}[];
+ // Added for gpt-image-1 responses
+ usage?: {
+ total_tokens: number;
+ input_tokens: number;
+ output_tokens: number;
+ input_tokens_details?: {
+ text_tokens: number;
+ image_tokens: number;
+ };
+ };
};
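// Sketch of a gpt-image-1 response matching this type (values illustrative):
//   {
//     created: 1713833628,
//     data: [{ b64_json: "..." }],
//     usage: {
//       total_tokens: 100, input_tokens: 50, output_tokens: 50,
//       input_tokens_details: { text_tokens: 10, image_tokens: 40 }
//     }
//   }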
async function downloadImage(url: string) {
@@ -65,11 +75,16 @@ export async function mirrorGeneratedImage(
let mirror: string;
if (item.b64_json) {
mirror = await saveB64Image(item.b64_json);
- } else {
+ } else if (item.url) {
mirror = await downloadImage(item.url);
+ } else {
+ req.log.warn("No image data found in response");
+ continue;
+ }
// Set the URL to our mirrored version
item.url = `${host}/user_content/${path.basename(mirror)}`;
await createThumbnail(mirror);
// Add to image history with the local URL
addToImageHistory({
url: item.url,
prompt,
@@ -84,6 +84,7 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
"azure-o3Tokens": 0,
"azure-o4-miniTokens": 0,
"azure-dall-eTokens": 0,
"azure-gpt-imageTokens": 0,
modelIds: [],
};
this.keys.push(newKey);
@@ -124,6 +124,7 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
"o3Tokens": 0,
"o4-miniTokens": 0,
"dall-eTokens": 0,
"gpt-imageTokens": 0,
modelIds: [],
};
this.keys.push(newKey);
+8 -2
@@ -34,7 +34,8 @@ export type OpenAIModelFamily =
| "o3-mini"
| "o3"
| "o4-mini"
| "dall-e";
| "dall-e"
| "gpt-image";
export type AnthropicModelFamily = "claude" | "claude-opus";
export type GoogleAIModelFamily =
| "gemini-flash"
@@ -84,6 +85,7 @@ export const MODEL_FAMILIES = (<A extends readonly ModelFamily[]>(
"o3",
"o4-mini",
"dall-e",
"gpt-image",
"claude",
"claude-opus",
"gemini-flash",
@@ -117,6 +119,7 @@ export const MODEL_FAMILIES = (<A extends readonly ModelFamily[]>(
"azure-o3-mini",
"azure-o3",
"azure-o4-mini",
"azure-gpt-image",
] as const);
export const LLM_SERVICES = (<A extends readonly LLMService[]>(
@@ -154,6 +157,7 @@ export const MODEL_FAMILY_SERVICE: {
"o3": "openai",
"o4-mini": "openai",
"dall-e": "openai",
"gpt-image": "openai",
claude: "anthropic",
"claude-opus": "anthropic",
"aws-claude": "aws",
@@ -180,6 +184,7 @@ export const MODEL_FAMILY_SERVICE: {
"azure-o3-mini": "azure",
"azure-o3": "azure",
"azure-o4-mini": "azure",
"azure-gpt-image": "azure",
"gemini-flash": "google-ai",
"gemini-pro": "google-ai",
"gemini-ultra": "google-ai",
@@ -189,9 +194,10 @@ export const MODEL_FAMILY_SERVICE: {
"mistral-large": "mistral-ai",
};
- export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e"];
+ export const IMAGE_GEN_MODELS: ModelFamily[] = ["dall-e", "azure-dall-e", "gpt-image", "azure-gpt-image"];
export const OPENAI_MODEL_FAMILY_MAP: { [regex: string]: OpenAIModelFamily } = {
"^gpt-image(-\\d+)?(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt-image",
"^gpt-4\\.5(-preview)?(-\\d{4}-\\d{2}-\\d{2})?$": "gpt45",
"^gpt-4\\.1(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41",
"^gpt-4\\.1-mini(-\\d{4}-\\d{2}-\\d{2})?$": "gpt41-mini",
+10
@@ -83,6 +83,16 @@ export function getTokenCostUsd(model: ModelFamily, tokens: number) {
case "dall-e":
cost = 0.00001;
break;
case "azure-gpt-image":
case "gpt-image":
// gpt-image-1 pricing:
// Text input tokens: $5 per 1M tokens
// Image input tokens: $10 per 1M tokens
// Image output tokens: $40 per 1M tokens
// Weighted average assuming a mix of text/image input and output
// Typical cost is $0.02-$0.19 per image depending on quality
cost = 0.000018; // Balanced estimate accounting for input/output mix
break;
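// Worked example (illustrative; the token count is assumed from OpenAI's
// published figures): a medium-quality 1024x1024 image is ~1,056 output
// tokens, so
//   1056 tokens x $40 / 1M tokens   = ~$0.042 at the pure output rate, vs.
//   1056 tokens x $0.000018 / token = ~$0.019 at the blended rate above.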
case "aws-claude":
case "gcp-claude":
case "claude":