168 lines
5.0 KiB
TypeScript
168 lines
5.0 KiB
TypeScript
/**
 * Very scuffed persistence system that uses a Hugging Face Datasets git repo
 * as a file system. We use this because it's free and everyone is already
 * deploying to Hugging Face Spaces anyway, so they can easily create a Dataset
 * repository too rather than having to find some other place to host files.
 *
 * We periodically commit to the repo, and then pull from it when we need to
 * read data. This is a bit slow, but it's fine for our purposes.
 */
|
|
import fs from "fs";
|
|
import os from "os";
|
|
import path from "path";
|
|
import { spawn } from "child_process";
|
|
import { config, Config } from "./config";
|
|
import { logger } from "./logger";
|
|
|
|
/** Logger for this file, derived from the shared logger via `child()`. */
const log = logger.child({ module: "dataset-persistence" });

/**
 * The instance registered by the `DatasetPersistence` constructor, or `null`
 * while none has been registered. The constructor returns this value instead
 * of a new object whenever it is set.
 */
let singleton: DatasetPersistence | null = null;
|
|
|
|
/**
 * Key/value file store backed by a Hugging Face Datasets git repository.
 *
 * Each key is a path relative to the repository root. Reads check the file
 * out of `HEAD`; writes commit the file and push it to the remote. All remote
 * access uses a private SSH key that is written to a temporary file.
 *
 * Constructing the class again after an instance has been configured returns
 * that first instance.
 */
class DatasetPersistence {
  /** Larger files are rejected because Git LFS is not set up for the repo. */
  private static readonly MAX_FILE_BYTES = 10 * 1024 * 1024;

  private initialized = false;
  private keyPath = `${os.tmpdir()}/id_rsa`;
  private repoPath = `${os.tmpdir()}/oai-proxy-dataset`;

  private repoUrl!: string;
  private sshKey!: string;

  /**
   * Returns the registered singleton if there is one. Otherwise validates the
   * configuration, copies the repo URL and SSH key from it, and registers
   * this instance as the singleton.
   *
   * When `config.gatekeeperStore` is not `"huggingface_datasets"` the instance
   * is left without a repo URL or key and is not registered.
   *
   * @throws Error if the Datasets store is selected but the repo URL or the
   *   SSH key is missing from the configuration.
   */
  constructor() {
    if (singleton) return singleton;
    if (config.gatekeeperStore !== "huggingface_datasets") return;
    DatasetPersistence.assertConfigured(config);
    this.repoUrl = config.hfDatasetRepoUrl;
    this.sshKey = config.hfPrivateSshKey.trim();
    singleton = this;
  }

  /**
   * Prepares the local working copy: writes the SSH key, clones the repo,
   * stores the committer identity and SSH command in the clone, and verifies
   * push access. Does nothing once a previous call has succeeded.
   *
   * @throws Error (after logging it) if any step fails. The instance stays
   *   uninitialized, so the next call starts over.
   */
  async init(): Promise<void> {
    if (this.initialized) return;

    log.info(
      { repoUrl: this.repoUrl, keyPath: this.keyPath, repoPath: this.repoPath },
      "Initializing Huggingface Datasets persistence."
    );

    try {
      if (!this.repoUrl || !this.sshKey) {
        throw new Error("Datasets persistence is not configured.");
      }

      await this.setupSshKey();

      // A failed earlier attempt may have left a clone behind, and `git clone`
      // refuses to write into a non-empty directory.
      await fs.promises.rm(this.repoPath, { recursive: true, force: true });

      log.info("Cloning repo...");
      // The working copy does not exist yet, so this cannot go through
      // runGit (which passes `-C repoPath`), and the SSH command has to be
      // supplied inline because there is no repo config to read it from.
      const cloneOutput = await this.spawnGit([
        "-c",
        `core.sshCommand=${this.sshCommand()}`,
        "clone",
        "--depth",
        "1",
        "--",
        this.repoUrl,
        this.repoPath,
      ]);
      log.info({ output: cloneOutput }, "Cloned repo.");

      // Repo-local settings can only be written once the clone exists.
      await this.runGit(["config", "core.sshCommand", this.sshCommand()]);
      await this.runGit([
        "config",
        "user.email",
        "oai-proxy-persistence@example.com",
      ]);
      await this.runGit(["config", "user.name", "Proxy Persistence"]);

      // Test write access. A push that is not accepted exits non-zero, which
      // runGit turns into a rejection; git's progress text goes to stderr, so
      // stdout is not a usable signal here.
      await this.runGit(["push"]);
      log.info("Datasets configuration looks good.");
    } catch (e) {
      log.error(
        { error: e },
        "Failed to initialize Huggingface Datasets persistence."
      );
      throw e;
    }

    this.initialized = true;
  }

  /**
   * Reads the committed contents of `key`.
   *
   * @param key Path of the file relative to the repository root.
   * @returns The file contents, or `null` if initialization, the checkout, or
   *   the read fails (the error is logged).
   */
  async get(key: string): Promise<Buffer | null> {
    try {
      await this.init();
      const filePath = this.resolveKeyPath(key);
      // Must finish before the read, otherwise a stale or missing file is
      // read while git is still writing it.
      await this.runGit(["checkout", "HEAD", "--", key]);
      // `await` keeps a read failure inside this try block.
      return await fs.promises.readFile(filePath);
    } catch (e) {
      log.error({ error: e }, "Failed to get key from Dataset repo.");
      return null;
    }
  }

  /**
   * Writes `value` to `key`, commits it, and pushes to the remote. Failures
   * are logged and not rethrown.
   *
   * @param key Path of the file relative to the repository root.
   * @param value New file contents; at most 10 MiB.
   */
  async set(key: string, value: Buffer): Promise<void> {
    try {
      await this.init();

      // TODO: Need to set up LFS for >10MB files
      // Checked before writing so an oversized file never lands on disk.
      if (value.byteLength > DatasetPersistence.MAX_FILE_BYTES) {
        throw new Error("File too large for non-LFS storage.");
      }

      const filePath = this.resolveKeyPath(key);
      await fs.promises.mkdir(path.dirname(filePath), { recursive: true });
      await fs.promises.writeFile(filePath, value);

      await this.runGit(["add", "--", key]);
      // `git commit` exits non-zero when nothing is staged, so only commit
      // when the contents actually changed.
      const pending = await this.runGit(["status", "--porcelain", "--", key]);
      if (pending.trim() !== "") {
        await this.runGit(["commit", "-m", `Update ${key}`]);
      }
      // Always push so a commit left behind by an earlier failed push is
      // still delivered.
      await this.runGit(["push"]);
    } catch (e) {
      log.error({ error: e }, "Failed to set key in Dataset repo.");
    }
  }

  /**
   * Fetches the remote tip and hard-resets the working copy to it. Failures
   * are logged and not rethrown.
   */
  protected async cleanup(): Promise<void> {
    try {
      await this.init();
      await this.runGit(["fetch", "--depth", "1"]);
      await this.runGit(["reset", "--hard", "FETCH_HEAD"]);
    } catch (e) {
      log.error({ error: e }, "Failed to cleanup Dataset repo.");
    }
  }

  /**
   * Writes the private key to `keyPath` with owner-only permissions.
   */
  protected async setupSshKey(): Promise<void> {
    // The constructor trims the key; some OpenSSH versions fail to parse a
    // private key file that lacks a trailing newline, so add it back.
    await fs.promises.writeFile(this.keyPath, `${this.sshKey}\n`, {
      mode: 0o600,
    });
    // `mode` only applies when the file is created; tighten an existing file.
    await fs.promises.chmod(this.keyPath, 0o600);
  }

  /**
   * Runs a git command inside the working copy.
   *
   * @param command Either an argument list, passed to git without any shell
   *   parsing, or a single string that is appended to `git -C <repoPath>` and
   *   parsed by the shell. Use the string form only for trusted constant text.
   * @returns Everything git wrote to stdout.
   * @throws Error if git cannot be started or exits with a non-zero code.
   */
  protected async runGit(command: string | readonly string[]): Promise<string> {
    if (typeof command === "string") {
      return this.spawnGit(`git -C ${this.repoPath} ${command}`);
    }
    return this.spawnGit(["-C", this.repoPath, ...command]);
  }

  /**
   * Validates that the Datasets settings are present in `input`.
   *
   * @throws Error naming the missing environment variable.
   */
  static assertConfigured(input: Config): asserts input is ConfigWithDatasets {
    if (!input.hfDatasetRepoUrl) {
      throw new Error("HF_DATASET_REPO_URL is required when using Datasets.");
    }

    if (!input.hfPrivateSshKey) {
      throw new Error("HF_PRIVATE_SSH_KEY is required when using Datasets.");
    }
  }

  /** SSH invocation that makes git authenticate with the key at `keyPath`. */
  private sshCommand(): string {
    return `ssh -i "${this.keyPath}"`;
  }

  /**
   * Maps a key to an absolute path inside the working copy, rejecting keys
   * that resolve outside it, to the root itself, or into `.git`.
   */
  private resolveKeyPath(key: string): string {
    const root = path.resolve(this.repoPath);
    const resolved = path.resolve(root, key);
    const [topLevel] = path.relative(root, resolved).split(path.sep);
    const insideRepo = resolved.startsWith(root + path.sep);
    if (!insideRepo || topLevel === ".git") {
      throw new Error(`Invalid key: ${key}`);
    }
    return resolved;
  }

  /**
   * Spawns git and collects its stdout. A string is run through the shell as
   * a full command line; an array is passed to the `git` executable directly.
   */
  private spawnGit(invocation: string | readonly string[]): Promise<string> {
    const display =
      typeof invocation === "string"
        ? invocation
        : `git ${invocation.join(" ")}`;
    log.debug({ command: display }, "Running git command.");

    return new Promise<string>((resolve, reject) => {
      const proc =
        typeof invocation === "string"
          ? spawn(invocation, { shell: true })
          : spawn("git", [...invocation]);
      const stdout: string[] = [];
      const stderr: string[] = [];
      let failedToStart = false;

      // Decode inside the stream so multi-byte characters split across
      // chunks are not corrupted.
      proc.stdout.setEncoding("utf8");
      proc.stderr.setEncoding("utf8");
      proc.stdout.on("data", (chunk: string) => stdout.push(chunk));
      proc.stderr.on("data", (chunk: string) => stderr.push(chunk));

      // Without this listener a spawn failure (for example git not being
      // installed) is thrown as an uncaught exception.
      proc.on("error", (err) => {
        failedToStart = true;
        log.error({ error: err }, "Failed to start git.");
        reject(err);
      });

      proc.on("close", (code) => {
        if (failedToStart) return;
        if (code !== 0) {
          const errorOutput = stderr.join("");
          log.error({ code, errorOutput }, "Git command failed.");
          reject(
            new Error(
              `Git command failed with exit code ${code}: ${errorOutput}`
            )
          );
        } else {
          resolve(stdout.join(""));
        }
      });
    });
  }
}
|
|
|
|
/**
 * A `Config` in which both Datasets settings are typed as plain strings. This
 * is the type that `DatasetPersistence.assertConfigured` narrows its argument
 * to.
 */
type ConfigWithDatasets = Config &
  Record<"hfDatasetRepoUrl" | "hfPrivateSshKey", string>;
|
|
|
|
export { DatasetPersistence };
|