|
@@ -25,6 +25,8 @@ const axios = require('axios');
|
|
|
const multer = require("multer");
|
|
|
const {chromium} = require('playwright');
|
|
|
const TurndownService = require('turndown');
|
|
|
+const cheerio = require('cheerio');
|
|
|
+const { v4: uuidv4 } = require('uuid');
|
|
|
|
|
|
const loginHandler = require('./login'); // 确保路径正确
|
|
|
const { r2rClient } = require("r2r-js");
|
|
@@ -1317,26 +1319,38 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
|
|
|
// console.log('收到的 req.body:', req.body);
|
|
|
// console.log('收到的 req.files:', req.files);
|
|
|
// console.log('收到的 headers:', req.headers);
|
|
|
- const url = req.body.url; // 前端传入 url
|
|
|
- if (!url) {
|
|
|
+ const requestUrl = req.body.url; // 前端传入 url
|
|
|
+ if (!requestUrl) {
|
|
|
return res.status(400).json({error: 'Missing url'});
|
|
|
}
|
|
|
try {
|
|
|
const browser = await chromium.launch({headless: true});
|
|
|
const page = await browser.newPage();
|
|
|
- await page.goto(url, {waitUntil: 'networkidle'});
|
|
|
+ await page.goto(requestUrl, {waitUntil: 'networkidle'});
|
|
|
const html = await page.content();
|
|
|
await browser.close();
|
|
|
// 新增:HTML 转 Markdown
|
|
|
const turndownService = new TurndownService();
|
|
|
- const markdownContent = turndownService.turndown(html);
|
|
|
+ // 用 cheerio 只提取正文部分
|
|
|
+ const cheerioDoc = cheerio.load(html);
|
|
|
+ let mainHtml = cheerioDoc('main').html() || cheerioDoc('article').html() || cheerioDoc('#content').html() || cheerioDoc('.content').html() || cheerioDoc('.post').html() || cheerioDoc('body').html();
|
|
|
+ // 用 cheerio 过滤无关标签
|
|
|
+ const cheerioMain = cheerio.load(mainHtml || '');
|
|
|
+ cheerioMain('style, script, noscript, iframe, link, header, footer, nav, form, .ads, .advertisement').remove();
|
|
|
+ mainHtml = cheerioMain.html();
|
|
|
+ const markdownContent = turndownService.turndown(mainHtml);
|
|
|
// 新增:解析 <title> 标签内容
|
|
|
let title = '';
|
|
|
const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
|
|
|
if (titleMatch && titleMatch[1]) {
|
|
|
title = titleMatch[1].trim();
|
|
|
}
|
|
|
-
|
|
|
+ let finalTitle;
|
|
|
+ if (title && title.trim()) {
|
|
|
+ finalTitle = title + '.md';
|
|
|
+ } else {
|
|
|
+ finalTitle = 'untitled-' + uuidv4() + '.md';
|
|
|
+ }
|
|
|
console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
|
|
|
|
|
|
// 解析用户传入的参数
|
|
@@ -1357,15 +1371,15 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
|
|
|
return res.status(400).json({error: 'metadata 不是合法的 JSON 字符串'});
|
|
|
}
|
|
|
}
|
|
|
- if (!finalMetadata) {
|
|
|
- finalMetadata = {
|
|
|
- title: title || "抓取的网页内容",
|
|
|
- source_url: url,
|
|
|
- content_type: "markdown",
|
|
|
- scraped_at: new Date().toISOString(),
|
|
|
- upload_id: Date.now() + '-' + Math.random()
|
|
|
- };
|
|
|
- }
|
|
|
+ // if (!finalMetadata) {
|
|
|
+ // finalMetadata = {
|
|
|
+ // title: title || uuidv4(),
|
|
|
+ // // source_url: url,
|
|
|
+ // content_type: "markdown",
|
|
|
+ // // scraped_at: new Date().toISOString(),
|
|
|
+ // // upload_id: Date.now() + '-' + Math.random()
|
|
|
+ // };
|
|
|
+ // }
|
|
|
// 处理 collection_ids 字段
|
|
|
let finalCollectionIds = collection_ids;
|
|
|
if (typeof finalCollectionIds === 'string') {
|
|
@@ -1380,68 +1394,44 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
|
|
|
finalMetadata.collection_ids = finalCollectionIds;
|
|
|
}
|
|
|
|
|
|
- // 组装 create 参数
|
|
|
- const createParams = {
|
|
|
- raw_text: markdownContent,
|
|
|
- metadata: finalMetadata
|
|
|
- };
|
|
|
+ // 补全 finalMetadata.title 和 content_type
|
|
|
+ if (!finalMetadata.title) {
|
|
|
+ finalMetadata.title = finalTitle;
|
|
|
+ }
|
|
|
+ finalMetadata.content_type = 'markdown';
|
|
|
+
|
|
|
+ // 用 axios 发送 multipart/form-data 方式上传到 r2r
|
|
|
+ const FormData = require('form-data');
|
|
|
+ const fs = require('fs');
|
|
|
+ const path = require('path');
|
|
|
+ const tmpFilePath = path.join(__dirname, `${title || 'untitled'}-${uuidv4()}.md`);
|
|
|
+ fs.writeFileSync(tmpFilePath, markdownContent, 'utf8');
|
|
|
+ const form = new FormData();
|
|
|
+ form.append('file', fs.createReadStream(tmpFilePath), finalTitle);
|
|
|
if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
|
|
|
- createParams.collection_ids = finalCollectionIds;
|
|
|
+ form.append('collection_ids', JSON.stringify(finalCollectionIds));
|
|
|
}
|
|
|
if (customId) {
|
|
|
- createParams.id = customId;
|
|
|
- }
|
|
|
- if (ingestion_mode) {
|
|
|
- createParams.ingestion_mode = ingestion_mode;
|
|
|
- } else {
|
|
|
- createParams.ingestionMode = "fast";
|
|
|
- }
|
|
|
- if (typeof run_with_orchestration !== 'undefined') {
|
|
|
- createParams.run_with_orchestration = run_with_orchestration;
|
|
|
+ form.append('id', customId);
|
|
|
}
|
|
|
-
|
|
|
- // 4. 将 markdown 内容上传到 R2R
|
|
|
- console.log("正在上传文档...");
|
|
|
- try {
|
|
|
- const documentResponse = await client.documents.create(createParams);
|
|
|
- console.log("文档上传成功!");
|
|
|
- console.log("文档ID:", documentResponse.results.documentId);
|
|
|
- result = {
|
|
|
- results: documentResponse.results,
|
|
|
- title: title
|
|
|
+ form.append('metadata', JSON.stringify(finalMetadata));
|
|
|
+ form.append('ingestion_mode', ingestion_mode || 'fast');
|
|
|
+ form.append('run_with_orchestration', 'false');
|
|
|
+ // 打印上传参数,方便排查问题
|
|
|
+ console.log('Request Headers:', form.getHeaders());
|
|
|
+ const url = 'https://r2rserver.cocorobo.cn/v3/documents';
|
|
|
+ const response = await axios.post(url, form, {
|
|
|
+ headers: {
|
|
|
+ ...form.getHeaders(),
|
|
|
+ // 'User-Agent': 'NodeClient/1.0'
|
|
|
}
|
|
|
- res.json(result);
|
|
|
- } catch (error) {
|
|
|
- console.error('Error occurred:', error);
|
|
|
- res.status(500).json({error: error.message});
|
|
|
- // 如果 409 冲突,尝试删除后重试
|
|
|
- // if (error.message && error.message.includes('409')) {
|
|
|
- // // 从错误信息中提取 documentId
|
|
|
- // const match = error.message.match(/Document ([\w-]+) is currently ingesting/);
|
|
|
- // const conflictDocId = match && match[1];
|
|
|
- // if (conflictDocId) {
|
|
|
- // try {
|
|
|
- // await client.documents.delete({id: conflictDocId});
|
|
|
- // console.log('冲突时已删除重复文档:', conflictDocId);
|
|
|
- // // 再次尝试上传
|
|
|
- // const documentResponse = await client.documents.create(createParams);
|
|
|
- // result = {
|
|
|
- // results: documentResponse.results,
|
|
|
- // title: title
|
|
|
- // }
|
|
|
- // res.json(result);
|
|
|
- // return;
|
|
|
- // } catch (e2) {
|
|
|
- // console.error('冲突时删除重试也失败:', e2);
|
|
|
- // throw e2;
|
|
|
- // }
|
|
|
- // } else {
|
|
|
- // throw error;
|
|
|
- // }
|
|
|
- // } else {
|
|
|
- // throw error;
|
|
|
- // }
|
|
|
+ });
|
|
|
+ fs.unlinkSync(tmpFilePath);
|
|
|
+ result = {
|
|
|
+ results: response.data.results,
|
|
|
+ title: finalTitle
|
|
|
}
|
|
|
+ res.json(result);
|
|
|
} catch (error) {
|
|
|
console.error('Error occurred:', error);
|
|
|
res.status(500).json({error: error.message});
|