3 Commits

Author SHA1 Message Date
nai-degen bf13a8b524 tweaks type 2023-08-28 03:52:04 -05:00
nai-degen 6453dae433 adds horrible wip git persistence layer 2023-08-24 14:17:09 -05:00
nai-degen 80ecbd78df adds HF datasets configs 2023-08-24 12:36:57 -05:00
3 changed files with 186 additions and 3 deletions
.env.example +4
@@ -43,6 +43,10 @@ ANTHROPIC_KEY=sk-ant-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 # You can set an admin key for user management when using user_token gatekeeper.
 # ADMIN_KEY=your-very-secret-key
 
+# These are used to push data to a Huggingface Dataset repository.
+# HF_DATASET_REPO_URL=https://huggingface.co/datasets/your-username/your-dataset-name
+# HF_PRIVATE_SSH_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+
 # These are used to persist user data to Firebase across restarts.
 # FIREBASE_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 # FIREBASE_RTDB_URL=https://xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.firebaseio.com
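One wrinkle worth noting: .env values are single lines, while OpenSSH private keys span multiple lines. A hypothetical normalizer like the one below (not part of this diff; the name and the base64 convention are assumptions) shows one way a deployer-supplied key could be massaged before use:

// Hypothetical helper, not part of this diff: accept either a raw PEM key or
// a base64-encoded copy of the key file from the environment.
function normalizeSshKey(raw: string): string {
  const trimmed = raw.trim();
  const pem = trimmed.startsWith("-----BEGIN")
    ? trimmed
    : Buffer.from(trimmed, "base64").toString("utf8").trim();
  // OpenSSH requires the key file to end with a newline.
  return pem + "\n";
}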
src/config.ts +15 -3
@@ -10,7 +10,7 @@ const isDev = process.env.NODE_ENV !== "production";
 type PromptLoggingBackend = "google_sheets";
 
-type Config = {
+export type Config = {
   /** The port the proxy server will listen on. */
   port: number;
   /** Comma-delimited list of OpenAI API keys. */
@@ -47,13 +47,21 @@ type Config = {
    * `memory`: Users are stored in memory and are lost on restart (default)
    *
    * `firebase_rtdb`: Users are stored in a Firebase Realtime Database; requires
-   * `firebaseKey` and `firebaseRtdbUrl` to be set.
+   * `firebaseKey` and `firebaseRtdbUrl` to be set. (deprecated)
+   *
+   * `huggingface_datasets`: Users are stored in a Huggingface Datasets git
+   * repository; requires `hfDatasetRepoUrl` and `hfPrivateSshKey` to be set.
    **/
-  gatekeeperStore: "memory" | "firebase_rtdb";
+  gatekeeperStore: "memory" | "firebase_rtdb" | "huggingface_datasets";
   /** URL of the Firebase Realtime Database if using the Firebase RTDB store. */
   firebaseRtdbUrl?: string;
   /** Base64-encoded Firebase service account key if using the Firebase RTDB store. */
   firebaseKey?: string;
+  /** URL of the Huggingface Datasets git repository if using the Huggingface
+   * Datasets store. */
+  hfDatasetRepoUrl?: string;
+  /** Private SSH key used to push to the Huggingface Dataset repository. */
+  hfPrivateSshKey?: string;
   /**
    * Maximum number of IPs per user, after which their token is disabled.
    * Users with the manually-assigned `special` role are exempt from this limit.
@@ -132,6 +140,8 @@ export const config: Config = {
   maxIpsPerUser: getEnvWithDefault("MAX_IPS_PER_USER", 0),
   firebaseRtdbUrl: getEnvWithDefault("FIREBASE_RTDB_URL", undefined),
   firebaseKey: getEnvWithDefault("FIREBASE_KEY", undefined),
+  hfDatasetRepoUrl: getEnvWithDefault("HF_DATASET_REPO_URL", undefined),
+  hfPrivateSshKey: getEnvWithDefault("HF_PRIVATE_SSH_KEY", undefined),
   modelRateLimit: getEnvWithDefault("MODEL_RATE_LIMIT", 4),
   maxContextTokensOpenAI: getEnvWithDefault("MAX_CONTEXT_TOKENS_OPENAI", 0),
   maxContextTokensAnthropic: getEnvWithDefault(
@@ -270,6 +280,8 @@ export const OMITTED_KEYS: (keyof Config)[] = [
   "googleSheetsKey",
   "firebaseKey",
   "firebaseRtdbUrl",
+  "hfDatasetRepoUrl",
+  "hfPrivateSshKey",
   "gatekeeperStore",
   "maxIpsPerUser",
   "blockedOrigins",
src/dataset-persistence.ts +167
@@ -0,0 +1,167 @@
/**
 * Very scuffed persistence system using a Huggingface Datasets git repo as a
 * file system. We use this because it's free and everyone is already deploying
 * to Huggingface's Spaces feature anyway, so they can easily create a Dataset
 * repository too rather than having to find some other place to host files.
 *
 * We periodically commit to the repo, and then pull from it when we need to
 * read data. This is a bit slow, but it's fine for our purposes.
 */
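// Rough lifecycle, sketched from the methods below: set() writes a file,
// commits, and pushes; get() restores a file from the local clone; and
// cleanup() fetches and hard-resets to pick up remote changes.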
import fs from "fs";
import os from "os";
import path from "path";
import { spawn } from "child_process";
import { config, Config } from "./config";
import { logger } from "./logger";
const log = logger.child({ module: "dataset-persistence" });
let singleton: DatasetPersistence | null = null;
class DatasetPersistence {
  private initialized: boolean = false;
  private keyPath = `${os.tmpdir()}/id_rsa`;
  private repoPath = `${os.tmpdir()}/oai-proxy-dataset`;
  private repoUrl!: string;
  private sshKey!: string;

  constructor() {
    // Returning an existing object from a constructor overrides the newly
    // created instance, so every `new DatasetPersistence()` after the first
    // yields the same singleton.
    if (singleton) return singleton;
    if (config.gatekeeperStore !== "huggingface_datasets") return;
    DatasetPersistence.assertConfigured(config);
    this.repoUrl = config.hfDatasetRepoUrl;
    this.sshKey = config.hfPrivateSshKey.trim();
    singleton = this;
  }

  async init() {
    if (this.initialized) return;
    log.info(
      { repoUrl: this.repoUrl, keyPath: this.keyPath, repoPath: this.repoPath },
      "Initializing Huggingface Datasets persistence."
    );
    try {
      this.setupSshKey();
      log.info("Cloning repo...");
      // Run the clone from the tmpdir; the repo directory doesn't exist until
      // the clone completes, so it can't host the git invocation itself.
      const cloneOutput = await this.runGit(
        `clone --depth 1 ${this.repoUrl} ${this.repoPath}`,
        os.tmpdir()
      );
      log.info({ output: cloneOutput }, "Cloned repo.");
      // Identity for the commits this module creates.
      await this.runGit(
        "config user.email 'oai-proxy-persistence@example.com'"
      );
      await this.runGit("config user.name 'Proxy Persistence'");
      // Test write access. With no new commits, a successful push reports
      // "Everything up-to-date" (git prints this to stderr, which runGit
      // folds into its output).
      const pushOutput = await this.runGit("push");
      if (!pushOutput.includes("Everything up-to-date")) {
        log.error({ output: pushOutput }, "Unexpected output from git push.");
        throw new Error("Unable to push to repo.");
      }
      log.info("Datasets configuration looks good.");
    } catch (e) {
      log.error(
        { error: e },
        "Failed to initialize Huggingface Datasets persistence."
      );
      throw e;
    }
    this.initialized = true;
  }

  async get(key: string): Promise<Buffer | null> {
    try {
      await this.init();
      // Restore the file from the last local commit in case it was modified
      // or deleted in the working tree; cleanup() is what pulls new data.
      await this.runGit(`checkout HEAD -- ${key}`);
      const filePath = path.join(this.repoPath, key);
      return await fs.promises.readFile(filePath);
    } catch (e) {
      log.error({ error: e }, "Failed to get key from Dataset repo.");
      return null;
    }
  }

  async set(key: string, value: Buffer) {
    try {
      await this.init();
      await fs.promises.writeFile(`${this.repoPath}/${key}`, value);
      // TODO: Need to set up LFS for >10MB files
      if (fs.statSync(`${this.repoPath}/${key}`).size > 10 * 1024 * 1024) {
        throw new Error("File too large for non-LFS storage.");
      }
      await this.runGit(`add ${key}`);
      await this.runGit(`commit -m "Update ${key}"`);
      await this.runGit("push");
    } catch (e) {
      log.error({ error: e }, "Failed to set key in Dataset repo.");
    }
  }

  protected async cleanup() {
    try {
      await this.init();
      // Throw away local state and reset to whatever is on the remote; this
      // is the "pull" half of the commit/pull cycle described above.
      await this.runGit("fetch --depth 1");
      await this.runGit("reset --hard FETCH_HEAD");
    } catch (e) {
      log.error({ error: e }, "Failed to cleanup Dataset repo.");
    }
  }

  protected setupSshKey() {
    // Write the deploy key to disk; runGit points GIT_SSH_COMMAND at this
    // path. A trailing newline is required or OpenSSH rejects the key file.
    fs.writeFileSync(this.keyPath, this.sshKey + "\n");
    fs.chmodSync(this.keyPath, 0o600);
  }

  protected async runGit(command: string, cwd = this.repoPath) {
    const cmd = `git -C ${cwd} ${command}`;
    log.debug({ command: cmd }, "Running git command.");
    return new Promise<string>((resolve, reject) => {
      const proc = spawn(cmd, {
        shell: true,
        // The repo-local git config doesn't exist until the clone completes,
        // so supply the deploy key via the environment rather than
        // `core.sshCommand`.
        env: { ...process.env, GIT_SSH_COMMAND: `ssh -i ${this.keyPath}` },
      });
      const stdout: string[] = [];
      const stderr: string[] = [];
      proc.stdout.on("data", (data) => stdout.push(data.toString()));
      proc.stderr.on("data", (data) => stderr.push(data.toString()));
      proc.on("close", (code) => {
        if (code !== 0) {
          const errorOutput = stderr.join("");
          log.error({ code, errorOutput }, "Git command failed.");
          reject(
            new Error(
              `Git command failed with exit code ${code}: ${errorOutput}`
            )
          );
        } else {
          // git writes status messages (e.g. the result of a push) to stderr,
          // so fold it into the resolved output.
          resolve(stdout.join("") + stderr.join(""));
        }
      });
    });
  }

  static assertConfigured(input: Config): asserts input is ConfigWithDatasets {
    if (!input.hfDatasetRepoUrl) {
      throw new Error("HF_DATASET_REPO_URL is required when using Datasets.");
    }
    if (!input.hfPrivateSshKey) {
      throw new Error("HF_PRIVATE_SSH_KEY is required when using Datasets.");
    }
  }
}

type ConfigWithDatasets = Config & {
  hfDatasetRepoUrl: string;
  hfPrivateSshKey: string;
};

export { DatasetPersistence };
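For context, here is a hypothetical caller of the new class; nothing in this diff wires it into the gatekeeper yet, so the function names and the "users.json" key below are illustrative only.

// Hypothetical usage sketch; no callers of DatasetPersistence appear in this
// diff, and "users.json" is an assumed key.
import { DatasetPersistence } from "./dataset-persistence";

async function saveUsers(users: unknown[]): Promise<void> {
  const store = new DatasetPersistence(); // constructor returns the singleton
  await store.set("users.json", Buffer.from(JSON.stringify(users)));
}

async function loadUsers(): Promise<unknown[] | null> {
  const store = new DatasetPersistence();
  const raw = await store.get("users.json");
  return raw ? JSON.parse(raw.toString()) : null;
}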