mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
Browse: improve markdown transform
(cherry picked from commit 3623eef47f)
This commit is contained in:
Generated
+160
@@ -29,6 +29,7 @@
|
||||
"@vercel/analytics": "^1.2.2",
|
||||
"@vercel/speed-insights": "^1.0.10",
|
||||
"browser-fs-access": "^0.35.0",
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"eventsource-parser": "^1.1.2",
|
||||
"idb-keyval": "^6.2.1",
|
||||
"next": "~14.1.4",
|
||||
@@ -2671,6 +2672,11 @@
|
||||
"resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz",
|
||||
"integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw=="
|
||||
},
|
||||
"node_modules/boolbase": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
|
||||
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
||||
@@ -2836,6 +2842,42 @@
|
||||
"url": "https://github.com/sponsors/wooorm"
|
||||
}
|
||||
},
|
||||
"node_modules/cheerio": {
|
||||
"version": "1.0.0-rc.12",
|
||||
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz",
|
||||
"integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==",
|
||||
"dependencies": {
|
||||
"cheerio-select": "^2.1.0",
|
||||
"dom-serializer": "^2.0.0",
|
||||
"domhandler": "^5.0.3",
|
||||
"domutils": "^3.0.1",
|
||||
"htmlparser2": "^8.0.1",
|
||||
"parse5": "^7.0.0",
|
||||
"parse5-htmlparser2-tree-adapter": "^7.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 6"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/cheerio-select": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
|
||||
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
|
||||
"dependencies": {
|
||||
"boolbase": "^1.0.0",
|
||||
"css-select": "^5.1.0",
|
||||
"css-what": "^6.1.0",
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3",
|
||||
"domutils": "^3.0.1"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
},
|
||||
"node_modules/chownr": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz",
|
||||
@@ -2994,6 +3036,32 @@
|
||||
"tiny-invariant": "^1.0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/css-select": {
|
||||
"version": "5.1.0",
|
||||
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
|
||||
"integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
|
||||
"dependencies": {
|
||||
"boolbase": "^1.0.0",
|
||||
"css-what": "^6.1.0",
|
||||
"domhandler": "^5.0.2",
|
||||
"domutils": "^3.0.1",
|
||||
"nth-check": "^2.0.1"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
},
|
||||
"node_modules/css-what": {
|
||||
"version": "6.1.0",
|
||||
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
|
||||
"integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
|
||||
"engines": {
|
||||
"node": ">= 6"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
},
|
||||
"node_modules/csstype": {
|
||||
"version": "3.1.3",
|
||||
"resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
|
||||
@@ -3222,11 +3290,62 @@
|
||||
"csstype": "^3.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/dom-serializer": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
|
||||
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.2",
|
||||
"entities": "^4.2.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/domelementtype": {
|
||||
"version": "2.3.0",
|
||||
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
|
||||
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
]
|
||||
},
|
||||
"node_modules/domhandler": {
|
||||
"version": "5.0.3",
|
||||
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
|
||||
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/domhandler?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/domino": {
|
||||
"version": "2.1.6",
|
||||
"resolved": "https://registry.npmjs.org/domino/-/domino-2.1.6.tgz",
|
||||
"integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
|
||||
},
|
||||
"node_modules/domutils": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
|
||||
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
|
||||
"dependencies": {
|
||||
"dom-serializer": "^2.0.0",
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/domutils?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/duplexer": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz",
|
||||
@@ -4673,6 +4792,24 @@
|
||||
"url": "https://opencollective.com/unified"
|
||||
}
|
||||
},
|
||||
"node_modules/htmlparser2": {
|
||||
"version": "8.0.2",
|
||||
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
|
||||
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
|
||||
"funding": [
|
||||
"https://github.com/fb55/htmlparser2?sponsor=1",
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
],
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3",
|
||||
"domutils": "^3.0.1",
|
||||
"entities": "^4.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
|
||||
@@ -6559,6 +6696,17 @@
|
||||
"resolved": "https://registry.npmjs.org/nprogress/-/nprogress-0.2.0.tgz",
|
||||
"integrity": "sha512-I19aIingLgR1fmhftnbWWO3dXc0hSxqHQHQb3H8m+K3TnEn/iSeTZZOyvKXWqQESMwuUVnatlCnZdLBZZt2VSA=="
|
||||
},
|
||||
"node_modules/nth-check": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
|
||||
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
|
||||
"dependencies": {
|
||||
"boolbase": "^1.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/nth-check?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/object-assign": {
|
||||
"version": "4.1.1",
|
||||
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
||||
@@ -6818,6 +6966,18 @@
|
||||
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/parse5-htmlparser2-tree-adapter": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz",
|
||||
"integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==",
|
||||
"dependencies": {
|
||||
"domhandler": "^5.0.2",
|
||||
"parse5": "^7.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/path-exists": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz",
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
"@vercel/analytics": "^1.2.2",
|
||||
"@vercel/speed-insights": "^1.0.10",
|
||||
"browser-fs-access": "^0.35.0",
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"eventsource-parser": "^1.1.2",
|
||||
"idb-keyval": "^6.2.1",
|
||||
"next": "~14.1.4",
|
||||
|
||||
@@ -2,11 +2,8 @@ import { z } from 'zod';
|
||||
import { TRPCError } from '@trpc/server';
|
||||
|
||||
import { BrowserContext, connect, ScreenshotOptions, TimeoutError } from '@cloudflare/puppeteer';
|
||||
|
||||
/**
|
||||
* Puppeteer implementation of the worker
|
||||
*/
|
||||
import TurndownService from 'turndown';
|
||||
import { default as TurndownService } from 'turndown';
|
||||
import { load as cheerioLoad } from 'cheerio';
|
||||
|
||||
import { createTRPCRouter, publicProcedure } from '~/server/api/trpc.server';
|
||||
import { env } from '~/server/env.mjs';
|
||||
@@ -155,15 +152,9 @@ async function workerPuppeteer(
|
||||
result.content = await page.evaluate(() => document.body.innerText || document.textContent || '');
|
||||
break;
|
||||
case 'markdown':
|
||||
await page.evaluate(() => {
|
||||
// Remove unnecessary elements
|
||||
document.querySelectorAll('script, style, nav, footer, aside, header, .ads, .comments')
|
||||
.forEach(el => el.remove());
|
||||
});
|
||||
const cleanedHtml = await page.content();
|
||||
const turndownService = new TurndownService({
|
||||
headingStyle: 'atx',
|
||||
});
|
||||
const html = await page.content();
|
||||
const cleanedHtml = cleanHtml(html);
|
||||
const turndownService = new TurndownService({ headingStyle: 'atx' });
|
||||
result.content = turndownService.turndown(cleanedHtml);
|
||||
break;
|
||||
}
|
||||
@@ -226,3 +217,35 @@ async function workerPuppeteer(
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
function cleanHtml(html: string) {
|
||||
const $ = cheerioLoad(html);
|
||||
|
||||
// Remove standard unwanted elements
|
||||
$('script, style, nav, aside, noscript, iframe, svg, canvas, .ads, .comments, link[rel="stylesheet"]').remove();
|
||||
|
||||
// Remove elements that might be specific to proxy services or injected by them
|
||||
$('[id^="brightdata-"], [class^="brightdata-"]').remove();
|
||||
|
||||
// Remove comments
|
||||
$('*').contents().filter(function() {
|
||||
return this.type === 'comment';
|
||||
}).remove();
|
||||
|
||||
// Remove empty elements
|
||||
$('p, div, span').each(function() {
|
||||
if ($(this).text().trim() === '' && $(this).children().length === 0) {
|
||||
$(this).remove();
|
||||
}
|
||||
});
|
||||
|
||||
// Merge consecutive paragraphs
|
||||
$('p + p').each(function() {
|
||||
$(this).prev().append(' ' + $(this).text());
|
||||
$(this).remove();
|
||||
});
|
||||
|
||||
// Return the cleaned HTML
|
||||
return $.html();
|
||||
}
|
||||
Reference in New Issue
Block a user