|
@@ -53,6 +53,17 @@ const upload = multer({
|
|
|
}
|
|
|
});
|
|
|
|
|
|
// Browser instance shared by all requests, created lazily by initBrowser().
let globalBrowser = null;
// Pending launch, if any. Concurrent callers await this same promise so that
// only one browser process is started.
let browserLaunchPromise = null;

/**
 * Return the shared headless browser, launching it on first use.
 *
 * Concurrent calls made while a launch is in progress all receive the same
 * browser instead of each starting their own. If the cached browser reports
 * that it is no longer connected, a new one is launched. A failed launch is
 * not cached, so a later call will try again.
 *
 * NOTE(review): relies on the browser object exposing `isConnected()`, as
 * Playwright's Browser does; confirm `chromium` is Playwright's browser type.
 *
 * @returns {Promise<object>} Resolves to the shared browser instance.
 * @throws Rejects with whatever error `chromium.launch` rejects with.
 */
async function initBrowser() {
    if (globalBrowser && globalBrowser.isConnected()) {
        return globalBrowser;
    }
    if (!browserLaunchPromise) {
        browserLaunchPromise = chromium.launch({headless: true})
            .then((browser) => {
                globalBrowser = browser;
                console.log('Browser initialized');
                return browser;
            })
            .finally(() => {
                // Clear the in-flight marker on success and on failure so a
                // rejected launch does not block future attempts.
                browserLaunchPromise = null;
            });
    }
    return browserLaunchPromise;
}
|
|
|
+
|
|
|
// Route every HTTP method on /login to loginHandler. The call is awaited and
// wrapped so that a synchronous throw or a rejected promise from loginHandler
// is passed to Express error handling via next() instead of surfacing as an
// unhandled promise rejection.
router.route("/login").all(async (req, res, next) => {
    try {
        await loginHandler(req, res);
    } catch (err) {
        next(err);
    }
});
|
|
@@ -1323,54 +1334,53 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
|
|
|
if (!requestUrl) {
|
|
|
return res.status(400).json({error: 'Missing url'});
|
|
|
}
|
|
|
+ const browser = await initBrowser();
|
|
|
+ const page = await browser.newPage();
|
|
|
try {
|
|
|
- const browser = await chromium.launch({headless: true});
|
|
|
- const page = await browser.newPage();
|
|
|
- await page.goto(requestUrl, {waitUntil: 'networkidle'});
|
|
|
- const html = await page.content();
|
|
|
- await browser.close();
|
|
|
- // 新增:HTML 转 Markdown
|
|
|
- const turndownService = new TurndownService();
|
|
|
- // 用 cheerio 只提取正文部分
|
|
|
- const cheerioDoc = cheerio.load(html);
|
|
|
- let mainHtml = cheerioDoc('main').html() || cheerioDoc('article').html() || cheerioDoc('#content').html() || cheerioDoc('.content').html() || cheerioDoc('.post').html() || cheerioDoc('body').html();
|
|
|
- // 用 cheerio 过滤无关标签
|
|
|
- const cheerioMain = cheerio.load(mainHtml || '');
|
|
|
- cheerioMain('style, script, noscript, iframe, link, header, footer, nav, form, .ads, .advertisement').remove();
|
|
|
- mainHtml = cheerioMain.html();
|
|
|
- const markdownContent = turndownService.turndown(mainHtml);
|
|
|
- // 新增:解析 <title> 标签内容
|
|
|
- let title = '';
|
|
|
- const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
|
|
|
- if (titleMatch && titleMatch[1]) {
|
|
|
- title = titleMatch[1].trim();
|
|
|
- }
|
|
|
- let finalTitle;
|
|
|
- if (title && title.trim()) {
|
|
|
- finalTitle = title + '.md';
|
|
|
- } else {
|
|
|
- finalTitle = 'untitled-' + uuidv4() + '.md';
|
|
|
- }
|
|
|
- console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
|
|
|
-
|
|
|
- // 解析用户传入的参数
|
|
|
- const {
|
|
|
- collection_ids,
|
|
|
- id: customId,
|
|
|
- metadata: customMetadata,
|
|
|
- ingestion_mode,
|
|
|
- run_with_orchestration
|
|
|
- } = req.body;
|
|
|
-
|
|
|
- // 处理 metadata 字段
|
|
|
- let finalMetadata = customMetadata;
|
|
|
- if (typeof finalMetadata === 'string') {
|
|
|
- try {
|
|
|
- finalMetadata = JSON.parse(finalMetadata);
|
|
|
- } catch (e) {
|
|
|
- return res.status(400).json({error: 'metadata 不是合法的 JSON 字符串'});
|
|
|
+ console.log("开始抓取页面:", requestUrl);
|
|
|
+ // await page.goto(requestUrl, {waitUntil: 'networkidle'});
|
|
|
+ await page.goto(requestUrl);
|
|
|
+ const html = await page.content();
|
|
|
+ // 新增:HTML 转 Markdown
|
|
|
+ const turndownService = new TurndownService();
|
|
|
+ // 用 cheerio 只提取正文部分
|
|
|
+ const cheerioDoc = cheerio.load(html);
|
|
|
+ let mainHtml = cheerioDoc('main').html() || cheerioDoc('article').html() || cheerioDoc('#content').html() || cheerioDoc('.content').html() || cheerioDoc('.post').html() || cheerioDoc('body').html();
|
|
|
+ // 用 cheerio 过滤无关标签
|
|
|
+ const cheerioMain = cheerio.load(mainHtml || '');
|
|
|
+ cheerioMain('style, script, noscript, iframe, link, header, footer, nav, form, .ads, .advertisement').remove();
|
|
|
+ mainHtml = cheerioMain.html();
|
|
|
+ const markdownContent = turndownService.turndown(mainHtml);
|
|
|
+ // 新增:解析 <title> 标签内容
|
|
|
+ let title = '';
|
|
|
+ const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
|
|
|
+ if (titleMatch && titleMatch[1]) {
|
|
|
+ title = titleMatch[1].trim();
|
|
|
+ }
|
|
|
+ let finalTitle;
|
|
|
+ if (title && title.trim()) {
|
|
|
+ finalTitle = title + '.md';
|
|
|
+ } else {
|
|
|
+ finalTitle = 'untitled-' + uuidv4() + '.md';
|
|
|
+ }
|
|
|
+ console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
|
|
|
+ // 解析用户传入的参数
|
|
|
+ const {
|
|
|
+ collection_ids,
|
|
|
+ id: customId,
|
|
|
+ metadata: customMetadata,
|
|
|
+ ingestion_mode,
|
|
|
+ run_with_orchestration
|
|
|
+ } = req.body;
|
|
|
+ // 处理 metadata 字段
|
|
|
+ let finalMetadata = customMetadata;
|
|
|
+ if (typeof finalMetadata === 'string') {
|
|
|
+ try {
|
|
|
+ finalMetadata = JSON.parse(finalMetadata);
|
|
|
+ } catch (e) {
|
|
|
+ return res.status(400).json({ error: 'metadata 不是合法的 JSON 字符串' });
|
|
|
+ }
|
|
|
}
|
|
|
- }
|
|
|
// if (!finalMetadata) {
|
|
|
// finalMetadata = {
|
|
|
// title: title || uuidv4(),
|
|
@@ -1380,62 +1390,65 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
|
|
|
// // upload_id: Date.now() + '-' + Math.random()
|
|
|
// };
|
|
|
// }
|
|
|
- // 处理 collection_ids 字段
|
|
|
- let finalCollectionIds = collection_ids;
|
|
|
- if (typeof finalCollectionIds === 'string') {
|
|
|
- try {
|
|
|
- finalCollectionIds = JSON.parse(finalCollectionIds);
|
|
|
- } catch (e) {
|
|
|
- // 不是合法 JSON 字符串就当作普通字符串
|
|
|
+ // 处理 collection_ids 字段
|
|
|
+ let finalCollectionIds = collection_ids;
|
|
|
+ if (typeof finalCollectionIds === 'string') {
|
|
|
+ try {
|
|
|
+ finalCollectionIds = JSON.parse(finalCollectionIds);
|
|
|
+ } catch (e) {
|
|
|
+ // 不是合法 JSON 字符串就当作普通字符串
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 如果 metadata 里没有 collection_ids,但用户传了 collection_ids,则加进去
|
|
|
+ if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
|
|
|
+ finalMetadata.collection_ids = finalCollectionIds;
|
|
|
}
|
|
|
- }
|
|
|
- // 如果 metadata 里没有 collection_ids,但用户传了 collection_ids,则加进去
|
|
|
- if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
|
|
|
- finalMetadata.collection_ids = finalCollectionIds;
|
|
|
- }
|
|
|
|
|
|
- // 补全 finalMetadata.title 和 content_type
|
|
|
- if (!finalMetadata.title) {
|
|
|
+ // 补全 finalMetadata.title 和 content_type
|
|
|
+ if (!finalMetadata.title) {
|
|
|
finalMetadata.title = finalTitle;
|
|
|
- }
|
|
|
- finalMetadata.content_type = 'markdown';
|
|
|
+ }
|
|
|
+ finalMetadata.content_type = 'markdown';
|
|
|
|
|
|
- // 用 axios 发送 multipart/form-data 方式上传到 r2r
|
|
|
- const FormData = require('form-data');
|
|
|
- const fs = require('fs');
|
|
|
- const path = require('path');
|
|
|
+ // 用 axios 发送 multipart/form-data 方式上传到 r2r
|
|
|
+ const FormData = require('form-data');
|
|
|
+ const fs = require('fs');
|
|
|
+ const path = require('path');
|
|
|
const tmpFilePath = path.join(__dirname, `${title || 'untitled'}-${uuidv4()}.md`);
|
|
|
- fs.writeFileSync(tmpFilePath, markdownContent, 'utf8');
|
|
|
- const form = new FormData();
|
|
|
+ fs.writeFileSync(tmpFilePath, markdownContent, 'utf8');
|
|
|
+ const form = new FormData();
|
|
|
form.append('file', fs.createReadStream(tmpFilePath), finalTitle);
|
|
|
- if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
|
|
|
- form.append('collection_ids', JSON.stringify(finalCollectionIds));
|
|
|
- }
|
|
|
- if (customId) {
|
|
|
- form.append('id', customId);
|
|
|
- }
|
|
|
- form.append('metadata', JSON.stringify(finalMetadata));
|
|
|
- form.append('ingestion_mode', ingestion_mode || 'fast');
|
|
|
- form.append('run_with_orchestration', 'false');
|
|
|
- // 打印上传参数,方便排查问题
|
|
|
+ if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
|
|
|
+ form.append('collection_ids', JSON.stringify(finalCollectionIds));
|
|
|
+ }
|
|
|
+ if (customId) {
|
|
|
+ form.append('id', customId);
|
|
|
+ }
|
|
|
+ form.append('metadata', JSON.stringify(finalMetadata));
|
|
|
+ form.append('ingestion_mode', ingestion_mode || 'fast');
|
|
|
+ form.append('run_with_orchestration', 'false');
|
|
|
+ // 打印上传参数,方便排查问题
|
|
|
console.log('Request Headers:', form.getHeaders());
|
|
|
- const url = 'https://r2rserver.cocorobo.cn/v3/documents';
|
|
|
- const response = await axios.post(url, form, {
|
|
|
- headers: {
|
|
|
- ...form.getHeaders(),
|
|
|
+ const url = 'https://r2rserver.cocorobo.cn/v3/documents';
|
|
|
+ const response = await axios.post(url, form, {
|
|
|
+ headers: {
|
|
|
+ ...form.getHeaders(),
|
|
|
// 'User-Agent': 'NodeClient/1.0'
|
|
|
- }
|
|
|
- });
|
|
|
- fs.unlinkSync(tmpFilePath);
|
|
|
- result = {
|
|
|
+ }
|
|
|
+ });
|
|
|
+ fs.unlinkSync(tmpFilePath);
|
|
|
+ result = {
|
|
|
results: response.data.results,
|
|
|
title: finalTitle
|
|
|
+ }
|
|
|
+ res.json(result);
|
|
|
+ } catch (error) {
|
|
|
+ console.error('Message:', error.message);
|
|
|
+ console.error('Stack:', error.stack);
|
|
|
+ res.status(500).json({error: error.message});
|
|
|
+ } finally {
|
|
|
+ await page.close(); // 确保页面总是被关闭
|
|
|
}
|
|
|
- res.json(result);
|
|
|
- } catch (error) {
|
|
|
- console.error('Error occurred:', error);
|
|
|
- res.status(500).json({error: error.message});
|
|
|
- }
|
|
|
});
|
|
|
|
|
|
module.exports = router;
|