Compare commits
3 Commits
| Author | SHA1 | Date |
|---|---|---|
| | bf13a8b524 | |
| | 6453dae433 | |
| | 80ecbd78df | |
@@ -43,6 +43,10 @@ ANTHROPIC_KEY=sk-ant-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
|||||||
# You can set an admin key for user management when using user_token gatekeeper.
|
# You can set an admin key for user management when using user_token gatekeeper.
|
||||||
# ADMIN_KEY=your-very-secret-key
|
# ADMIN_KEY=your-very-secret-key
|
||||||
|
|
||||||
|
# These are used to push data to a Huggingface Dataset repository.
|
||||||
|
# HF_DATASET_REPO_URL=https://huggingface.co/datasets/your-username/your-dataset-name
|
||||||
|
# HF_PRIVATE_SSH_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||||
|
|
||||||
# These are used to persist user data to Firebase across restarts.
|
# These are used to persist user data to Firebase across restarts.
|
||||||
# FIREBASE_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
# FIREBASE_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||||
# FIREBASE_RTDB_URL=https://xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.firebaseio.com
|
# FIREBASE_RTDB_URL=https://xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.firebaseio.com
|
||||||
|
|||||||
+15
-3
@@ -10,7 +10,7 @@ const isDev = process.env.NODE_ENV !== "production";
|
|||||||
|
|
||||||
type PromptLoggingBackend = "google_sheets";
|
type PromptLoggingBackend = "google_sheets";
|
||||||
|
|
||||||
type Config = {
|
export type Config = {
|
||||||
/** The port the proxy server will listen on. */
|
/** The port the proxy server will listen on. */
|
||||||
port: number;
|
port: number;
|
||||||
/** Comma-delimited list of OpenAI API keys. */
|
/** Comma-delimited list of OpenAI API keys. */
|
||||||
@@ -47,13 +47,21 @@ type Config = {
|
|||||||
* `memory`: Users are stored in memory and are lost on restart (default)
|
* `memory`: Users are stored in memory and are lost on restart (default)
|
||||||
*
|
*
|
||||||
* `firebase_rtdb`: Users are stored in a Firebase Realtime Database; requires
|
* `firebase_rtdb`: Users are stored in a Firebase Realtime Database; requires
|
||||||
* `firebaseKey` and `firebaseRtdbUrl` to be set.
|
* `firebaseKey` and `firebaseRtdbUrl` to be set. (deprecated)
|
||||||
|
*
|
||||||
|
* `huggingface_datasets`: Users are stored in a Huggingface Datasets git
|
||||||
|
* repository; requires `hfDatasetRepoUrl` and `hfPrivateSshKey` to be set.
|
||||||
**/
|
**/
|
||||||
gatekeeperStore: "memory" | "firebase_rtdb";
|
gatekeeperStore: "memory" | "firebase_rtdb" | "huggingface_datasets";
|
||||||
/** URL of the Firebase Realtime Database if using the Firebase RTDB store. */
|
/** URL of the Firebase Realtime Database if using the Firebase RTDB store. */
|
||||||
firebaseRtdbUrl?: string;
|
firebaseRtdbUrl?: string;
|
||||||
/** Base64-encoded Firebase service account key if using the Firebase RTDB store. */
|
/** Base64-encoded Firebase service account key if using the Firebase RTDB store. */
|
||||||
firebaseKey?: string;
|
firebaseKey?: string;
|
||||||
|
/** URL of the Huggingface Datasets git repository if using the Huggingface
|
||||||
|
* Datasets store. */
|
||||||
|
hfDatasetRepoUrl?: string;
|
||||||
|
/** Private SSH key used to push to the Huggingface Dataset repository. */
|
||||||
|
hfPrivateSshKey?: string;
|
||||||
/**
|
/**
|
||||||
* Maximum number of IPs per user, after which their token is disabled.
|
* Maximum number of IPs per user, after which their token is disabled.
|
||||||
* Users with the manually-assigned `special` role are exempt from this limit.
|
* Users with the manually-assigned `special` role are exempt from this limit.
|
||||||
@@ -132,6 +140,8 @@ export const config: Config = {
|
|||||||
maxIpsPerUser: getEnvWithDefault("MAX_IPS_PER_USER", 0),
|
maxIpsPerUser: getEnvWithDefault("MAX_IPS_PER_USER", 0),
|
||||||
firebaseRtdbUrl: getEnvWithDefault("FIREBASE_RTDB_URL", undefined),
|
firebaseRtdbUrl: getEnvWithDefault("FIREBASE_RTDB_URL", undefined),
|
||||||
firebaseKey: getEnvWithDefault("FIREBASE_KEY", undefined),
|
firebaseKey: getEnvWithDefault("FIREBASE_KEY", undefined),
|
||||||
|
hfDatasetRepoUrl: getEnvWithDefault("HF_DATASET_REPO_URL", undefined),
|
||||||
|
hfPrivateSshKey: getEnvWithDefault("HF_PRIVATE_SSH_KEY", undefined),
|
||||||
modelRateLimit: getEnvWithDefault("MODEL_RATE_LIMIT", 4),
|
modelRateLimit: getEnvWithDefault("MODEL_RATE_LIMIT", 4),
|
||||||
maxContextTokensOpenAI: getEnvWithDefault("MAX_CONTEXT_TOKENS_OPENAI", 0),
|
maxContextTokensOpenAI: getEnvWithDefault("MAX_CONTEXT_TOKENS_OPENAI", 0),
|
||||||
maxContextTokensAnthropic: getEnvWithDefault(
|
maxContextTokensAnthropic: getEnvWithDefault(
|
||||||
@@ -270,6 +280,8 @@ export const OMITTED_KEYS: (keyof Config)[] = [
|
|||||||
"googleSheetsKey",
|
"googleSheetsKey",
|
||||||
"firebaseKey",
|
"firebaseKey",
|
||||||
"firebaseRtdbUrl",
|
"firebaseRtdbUrl",
|
||||||
|
"hfDatasetRepoUrl",
|
||||||
|
"hfPrivateSshKey",
|
||||||
"gatekeeperStore",
|
"gatekeeperStore",
|
||||||
"maxIpsPerUser",
|
"maxIpsPerUser",
|
||||||
"blockedOrigins",
|
"blockedOrigins",
|
||||||
|
|||||||
@@ -0,0 +1,167 @@
|
|||||||
|
/**
|
||||||
|
* Very scuffed persistence system using a Huggingface's Datasets git repo as a
|
||||||
|
* file system. We use this because it's free and everyone is already deploying
|
||||||
|
* to Huggingface's Spaces feature anyway, so they can easily create a Dataset
|
||||||
|
* repository too rather than having to find some other place to host files.
|
||||||
|
*
|
||||||
|
* We periodically commit to the repo, and then pull from it when we need to
|
||||||
|
* read data. This is a bit slow, but it's fine for our purposes.
|
||||||
|
*/
|
||||||
|
import fs from "fs";
|
||||||
|
import os from "os";
|
||||||
|
import path from "path";
|
||||||
|
import { spawn } from "child_process";
|
||||||
|
import { config, Config } from "./config";
|
||||||
|
import { logger } from "./logger";
|
||||||
|
|
||||||
|
const log = logger.child({ module: "dataset-persistence" });
|
||||||
|
|
||||||
|
let singleton: DatasetPersistence | null = null;
|
||||||
|
|
||||||
|
class DatasetPersistence {
|
||||||
|
private initialized: boolean = false;
|
||||||
|
private keyPath = `${os.tmpdir()}/id_rsa`;
|
||||||
|
private repoPath = `${os.tmpdir()}/oai-proxy-dataset`;
|
||||||
|
|
||||||
|
private repoUrl!: string;
|
||||||
|
private sshKey!: string;
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
if (singleton) return singleton;
|
||||||
|
if (config.gatekeeperStore !== "huggingface_datasets") return;
|
||||||
|
DatasetPersistence.assertConfigured(config);
|
||||||
|
this.repoUrl = config.hfDatasetRepoUrl;
|
||||||
|
this.sshKey = config.hfPrivateSshKey.trim();
|
||||||
|
singleton = this;
|
||||||
|
}
|
||||||
|
|
||||||
|
async init() {
|
||||||
|
if (this.initialized) return;
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
{ repoUrl: this.repoUrl, keyPath: this.keyPath, repoPath: this.repoPath },
|
||||||
|
"Initializing Huggingface Datasets persistence."
|
||||||
|
);
|
||||||
|
|
||||||
|
try {
|
||||||
|
this.setupSshKey();
|
||||||
|
|
||||||
|
await this.runGit(
|
||||||
|
"config user.email 'oai-proxy-persistence@example.com'"
|
||||||
|
);
|
||||||
|
await this.runGit("config user.name 'Proxy Persistence'");
|
||||||
|
log.info("Cloning repo...");
|
||||||
|
const cloneOutput = await this.runGit(
|
||||||
|
`clone --depth 1 ${this.repoUrl} ${this.repoPath}`
|
||||||
|
);
|
||||||
|
log.info({ output: cloneOutput.toString() }, "Cloned repo.");
|
||||||
|
|
||||||
|
// Test write access
|
||||||
|
const pushOutput = this.runGit("push").toString();
|
||||||
|
if (pushOutput !== "Everything up-to-date") {
|
||||||
|
log.error({ output: pushOutput }, "Unexpected output from git push.");
|
||||||
|
throw new Error("Unable to push to repo.");
|
||||||
|
}
|
||||||
|
log.info("Datasets configuration looks good.");
|
||||||
|
} catch (e) {
|
||||||
|
log.error(
|
||||||
|
{ error: e },
|
||||||
|
"Failed to initialize Huggingface Datasets persistence."
|
||||||
|
);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.initialized = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async get(key: string): Promise<Buffer | null> {
|
||||||
|
try {
|
||||||
|
await this.init();
|
||||||
|
this.runGit(`checkout HEAD -- ${key}`);
|
||||||
|
const filePath = path.join(this.repoPath, key);
|
||||||
|
return fs.promises.readFile(filePath);
|
||||||
|
} catch (e) {
|
||||||
|
log.error({ error: e }, "Failed to get key from Dataset repo.");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async set(key: string, value: Buffer) {
|
||||||
|
try {
|
||||||
|
await this.init();
|
||||||
|
|
||||||
|
await fs.promises.writeFile(`${this.repoPath}/${key}`, value);
|
||||||
|
|
||||||
|
// TODO: Need to set up LFS for >10MB files
|
||||||
|
if (fs.statSync(`${this.repoPath}/${key}`).size > 10 * 1024 * 1024) {
|
||||||
|
throw new Error("File too large for non-LFS storage.");
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.runGit(`add ${key}`);
|
||||||
|
await this.runGit(`commit -m "Update ${key}"`);
|
||||||
|
await this.runGit("push");
|
||||||
|
} catch (e) {
|
||||||
|
log.error({ error: e }, "Failed to set key in Dataset repo.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected async cleanup() {
|
||||||
|
try {
|
||||||
|
await this.init();
|
||||||
|
await this.runGit("fetch --depth 1");
|
||||||
|
await this.runGit("reset --hard FETCH_HEAD");
|
||||||
|
} catch (e) {
|
||||||
|
log.error({ error: e }, "Failed to cleanup Dataset repo.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected async setupSshKey() {
|
||||||
|
fs.writeFileSync(this.keyPath, this.sshKey);
|
||||||
|
fs.chmodSync(this.keyPath, 0o600);
|
||||||
|
await this.runGit(`config core.sshCommand 'ssh -i ${this.keyPath}'`);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected async runGit(command: string) {
|
||||||
|
const cmd = `git -C ${this.repoPath} ${command}`;
|
||||||
|
log.debug({ command: cmd }, "Running git command.");
|
||||||
|
return new Promise<string>((resolve, reject) => {
|
||||||
|
const proc = spawn(cmd, { shell: true });
|
||||||
|
const stdout: string[] = [];
|
||||||
|
const stderr: string[] = [];
|
||||||
|
|
||||||
|
proc.stdout.on("data", (data) => stdout.push(data.toString()));
|
||||||
|
proc.stderr.on("data", (data) => stderr.push(data.toString()));
|
||||||
|
|
||||||
|
proc.on("close", (code) => {
|
||||||
|
if (code !== 0) {
|
||||||
|
const errorOutput = stderr.join("");
|
||||||
|
log.error({ code, errorOutput }, "Git command failed.");
|
||||||
|
reject(
|
||||||
|
new Error(
|
||||||
|
`Git command failed with exit code ${code}: ${errorOutput}`
|
||||||
|
)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
resolve(stdout.join(""));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
static assertConfigured(input: Config): asserts input is ConfigWithDatasets {
|
||||||
|
if (!input.hfDatasetRepoUrl) {
|
||||||
|
throw new Error("HF_DATASET_REPO_URL is required when using Datasets.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!input.hfPrivateSshKey) {
|
||||||
|
throw new Error("HF_PRIVATE_SSH_KEY is required when using Datasets.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type ConfigWithDatasets = Config & {
|
||||||
|
hfDatasetRepoUrl: string;
|
||||||
|
hfPrivateSshKey: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export { DatasetPersistence };
|
||||||
Reference in New Issue
Block a user