mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
116 lines
4.2 KiB
JavaScript
Executable File
116 lines
4.2 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
/**
|
|
* Parse Ollama models from HTML (sorted by newest for stable ordering)
|
|
*
|
|
* Usage:
|
|
* 1. Fetch HTML: curl -s "https://ollama.com/library?sort=newest" -o /tmp/ollama-newest.html
|
|
* 2. Parse: node .claude/scripts/parse-ollama-models.js
|
|
*
|
|
* Outputs: pipe-delimited format: modelName|pulls|capabilities|sizes
|
|
* Example: deepseek-r1|66200000|tools,thinking|1.5b,7b,8b,14b,32b,70b,671b
|
|
*
|
|
* Filtering rules:
|
|
* - Top 30 newest models are always included (regardless of pull count)
|
|
* - After top 30, only models with 50K+ pulls are included
|
|
* - Models with 'cloud' capability are always excluded
|
|
* - Models with 'embedding' capability are always excluded
|
|
*
|
|
* Pull counts are rounded to significant figures for stable diffs:
|
|
* - >=10M: round to 100K (e.g., 109,123,456 -> 109,100,000)
|
|
* - >=1M: round to 10K (e.g., 5,432,100 -> 5,430,000)
|
|
* - <1M: round to 1K (e.g., 88,700 -> 89,000)
|
|
*/
|
|
|
|
const fs = require('fs');
|
|
const os = require('os');
|
|
const path = require('path');
|
|
|
|
const htmlPath = process.argv[2] || path.join(os.tmpdir(), 'ollama-newest.html');
|
|
const TOP_N_ALWAYS_INCLUDE = 30;
|
|
const MIN_PULLS_THRESHOLD = 50000;
|
|
|
|
if (!fs.existsSync(htmlPath)) {
|
|
console.error(`Error: HTML file not found at ${htmlPath}`);
|
|
console.error('Please fetch it first with:');
|
|
console.error(' curl -s "https://ollama.com/library?sort=newest" -o /tmp/ollama-newest.html');
|
|
process.exit(1);
|
|
}
|
|
|
|
const html = fs.readFileSync(htmlPath, 'utf8');
|
|
|
|
// Split into model sections - each starts with <a href="/library/
|
|
const modelSections = html.split(/<a href="\/library\//);
|
|
const allParsedModels = [];
|
|
|
|
for (let i = 1; i < modelSections.length; i++) {
|
|
const section = modelSections[i].substring(0, 5000); // Large enough window to capture all data
|
|
|
|
// Extract model name (first quoted string)
|
|
const nameMatch = section.match(/^([^"]+)"/);
|
|
if (!nameMatch) continue;
|
|
const name = nameMatch[1];
|
|
|
|
// Extract pulls using x-test-pull-count
|
|
const pullsMatch = section.match(/x-test-pull-count>([^<]+)</);
|
|
let pulls = 0;
|
|
if (pullsMatch) {
|
|
const pullStr = pullsMatch[1].replace(/,/g, '');
|
|
if (pullStr.includes('M')) {
|
|
pulls = Math.floor(parseFloat(pullStr) * 1000000);
|
|
} else if (pullStr.includes('K')) {
|
|
pulls = Math.floor(parseFloat(pullStr) * 1000);
|
|
} else {
|
|
pulls = parseInt(pullStr);
|
|
}
|
|
}
|
|
|
|
// Extract capabilities (tools, vision, embedding, thinking, cloud)
|
|
const capabilities = [];
|
|
const capabilityRegex = /x-test-capability[^>]*>([^<]+)</g;
|
|
let capMatch;
|
|
while ((capMatch = capabilityRegex.exec(section)) !== null) {
|
|
capabilities.push(capMatch[1].trim());
|
|
}
|
|
|
|
// Extract sizes (1.5b, 7b, etc.)
|
|
const sizes = [];
|
|
const sizeRegex = /x-test-size[^>]*>([^<]+)</g;
|
|
let sizeMatch;
|
|
while ((sizeMatch = sizeRegex.exec(section)) !== null) {
|
|
sizes.push(sizeMatch[1].trim());
|
|
}
|
|
|
|
// Skip models with 'cloud' or 'embedding' capability
|
|
if (capabilities.includes('cloud') || capabilities.includes('embedding')) {
|
|
continue;
|
|
}
|
|
|
|
allParsedModels.push({ name, pulls: roundPulls(pulls), capabilities, sizes });
|
|
}
|
|
|
|
// Apply filtering: top 30 always included, rest need 50K+ pulls
|
|
const models = allParsedModels.filter((model, index) => {
|
|
return index < TOP_N_ALWAYS_INCLUDE || model.pulls >= MIN_PULLS_THRESHOLD;
|
|
});
|
|
|
|
/**
|
|
* Round pulls to significant figures for stable output.
|
|
* This reduces churn from daily fluctuations while preserving magnitude.
|
|
*/
|
|
function roundPulls(pulls) {
|
|
if (pulls >= 10000000) return Math.round(pulls / 100000) * 100000; // >=10M: round to 100K
|
|
if (pulls >= 1000000) return Math.round(pulls / 10000) * 10000; // >=1M: round to 10K
|
|
return Math.round(pulls / 1000) * 1000; // <1M: round to 1K
|
|
}
|
|
|
|
// Output in pipe-delimited format (in the order they appear on the page)
|
|
models.forEach(m => {
|
|
const caps = m.capabilities.join(',');
|
|
const tags = m.sizes.join(',');
|
|
console.log(`${m.name}|${m.pulls}|${caps}|${tags}`);
|
|
});
|
|
|
|
const topNCount = Math.min(TOP_N_ALWAYS_INCLUDE, allParsedModels.length);
|
|
const thresholdCount = models.length - topNCount;
|
|
console.error(`\nTotal models: ${models.length} (top ${topNCount} newest + ${thresholdCount} with ${MIN_PULLS_THRESHOLD / 1000}K+ pulls)`);
|