ソースを参照

Merge branch 'feat-web2md-import'

# Conflicts:
#	package-lock.json
jimmylee 2 日前
コミット
ec92ad260c
3 ファイル変更、68 行追加、71 行削除
  1. 7 1
      package-lock.json
  2. 1 0
      package.json
  3. 60 70
      pbl.js

+ 7 - 1
package-lock.json

@@ -10,6 +10,7 @@
       "dependencies": {
         "axios": "^1.7.9",
         "bcryptjs": "^2.4.3",
+        "cheerio": "^1.1.0",
         "crypto": "^1.0.1",
         "express": "^4.18.2",
         "file": "^0.2.2",
@@ -729,7 +730,8 @@
     "node_modules/@mixmark-io/domino": {
       "version": "2.2.0",
       "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz",
-      "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="
+      "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==",
+      "license": "BSD-2-Clause"
     },
     "node_modules/@rrweb/types": {
       "version": "2.0.0-alpha.17",
@@ -1871,6 +1873,7 @@
       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
       "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
       "hasInstallScript": true,
+      "license": "MIT",
       "optional": true,
       "os": [
         "darwin"
@@ -3125,6 +3128,7 @@
       "version": "1.53.0",
       "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.0.tgz",
       "integrity": "sha512-ghGNnIEYZC4E+YtclRn4/p6oYbdPiASELBIYkBXfaTVKreQUYbMUYQDwS12a8F0/HtIjr/CkGjtwABeFPGcS4Q==",
+      "license": "Apache-2.0",
       "dependencies": {
         "playwright-core": "1.53.0"
       },
@@ -3142,6 +3146,7 @@
       "version": "1.53.0",
       "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.0.tgz",
       "integrity": "sha512-mGLg8m0pm4+mmtB7M89Xw/GSqoNC+twivl8ITteqvAndachozYe2ZA7srU6uleV1vEdAHYqjq+SV8SNxRRFYBw==",
+      "license": "Apache-2.0",
       "bin": {
         "playwright-core": "cli.js"
       },
@@ -3961,6 +3966,7 @@
       "version": "7.2.0",
       "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.0.tgz",
       "integrity": "sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==",
+      "license": "MIT",
       "dependencies": {
         "@mixmark-io/domino": "^2.2.0"
       }

+ 1 - 0
package.json

@@ -12,6 +12,7 @@
   "dependencies": {
     "axios": "^1.7.9",
     "bcryptjs": "^2.4.3",
+    "cheerio": "^1.1.0",
     "crypto": "^1.0.1",
     "express": "^4.18.2",
     "file": "^0.2.2",

+ 60 - 70
pbl.js

@@ -25,6 +25,8 @@ const axios = require('axios');
 const multer = require("multer");
 const {chromium} = require('playwright');
 const TurndownService = require('turndown');
+const cheerio = require('cheerio');
+const { v4: uuidv4 } = require('uuid');
 
 const loginHandler = require('./login'); // 确保路径正确
 const { r2rClient } = require("r2r-js");
@@ -1317,26 +1319,38 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
     // console.log('收到的 req.body:', req.body);
     // console.log('收到的 req.files:', req.files);
     // console.log('收到的 headers:', req.headers);
-    const url = req.body.url; // 前端传入 url
-    if (!url) {
+    const requestUrl = req.body.url; // 前端传入 url
+    if (!requestUrl) {
         return res.status(400).json({error: 'Missing url'});
     }
     try {
         const browser = await chromium.launch({headless: true});
         const page = await browser.newPage();
-        await page.goto(url, {waitUntil: 'networkidle'});
+        await page.goto(requestUrl, {waitUntil: 'networkidle'});
         const html = await page.content();
         await browser.close();
         // 新增:HTML 转 Markdown
         const turndownService = new TurndownService();
-        const markdownContent = turndownService.turndown(html);
+        // 用 cheerio 只提取正文部分
+        const cheerioDoc = cheerio.load(html);
+        let mainHtml = cheerioDoc('main').html() || cheerioDoc('article').html() || cheerioDoc('#content').html() || cheerioDoc('.content').html() || cheerioDoc('.post').html() || cheerioDoc('body').html();
+        // 用 cheerio 过滤无关标签
+        const cheerioMain = cheerio.load(mainHtml || '');
+        cheerioMain('style, script, noscript, iframe, link, header, footer, nav, form, .ads, .advertisement').remove();
+        mainHtml = cheerioMain.html();
+        const markdownContent = turndownService.turndown(mainHtml);
         // 新增:解析 <title> 标签内容
         let title = '';
         const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
         if (titleMatch && titleMatch[1]) {
             title = titleMatch[1].trim();
         }
-
+        let finalTitle;
+        if (title && title.trim()) {
+            finalTitle = title + '.md';
+        } else {
+            finalTitle = 'untitled-' + uuidv4() + '.md';
+        }
         console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
 
         // 解析用户传入的参数
@@ -1357,15 +1371,15 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
                 return res.status(400).json({error: 'metadata 不是合法的 JSON 字符串'});
             }
         }
-        if (!finalMetadata) {
-            finalMetadata = {
-                title: title || "抓取的网页内容",
-                source_url: url,
-                content_type: "markdown",
-                scraped_at: new Date().toISOString(),
-                upload_id: Date.now() + '-' + Math.random()
-            };
-        }
+        // if (!finalMetadata) {
+        //     finalMetadata = {
+        //         title: title || uuidv4(),
+        //         // source_url: url,
+        //         content_type: "markdown",
+        //         // scraped_at: new Date().toISOString(),
+        //         // upload_id: Date.now() + '-' + Math.random()
+        //     };
+        // }
         // 处理 collection_ids 字段
         let finalCollectionIds = collection_ids;
         if (typeof finalCollectionIds === 'string') {
@@ -1380,68 +1394,44 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
             finalMetadata.collection_ids = finalCollectionIds;
         }
 
-        // 组装 create 参数
-        const createParams = {
-            raw_text: markdownContent,
-            metadata: finalMetadata
-        };
+        // 补全 finalMetadata.title 和 content_type
+        if (!finalMetadata.title) {
+            finalMetadata.title = finalTitle;
+        }
+        finalMetadata.content_type = 'markdown';
+
+        // 用 axios 发送 multipart/form-data 方式上传到 r2r
+        const FormData = require('form-data');
+        const fs = require('fs');
+        const path = require('path');
+        const tmpFilePath = path.join(__dirname, `${title || 'untitled'}-${uuidv4()}.md`);
+        fs.writeFileSync(tmpFilePath, markdownContent, 'utf8');
+        const form = new FormData();
+        form.append('file', fs.createReadStream(tmpFilePath), finalTitle);
         if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
-            createParams.collection_ids = finalCollectionIds;
+            form.append('collection_ids', JSON.stringify(finalCollectionIds));
         }
         if (customId) {
-            createParams.id = customId;
-        }
-        if (ingestion_mode) {
-            createParams.ingestion_mode = ingestion_mode;
-        } else {
-            createParams.ingestionMode = "fast";
-        }
-        if (typeof run_with_orchestration !== 'undefined') {
-            createParams.run_with_orchestration = run_with_orchestration;
+            form.append('id', customId);
         }
-
-        // 4. 将 markdown 内容上传到 R2R
-        console.log("正在上传文档...");
-        try {
-            const documentResponse = await client.documents.create(createParams);
-            console.log("文档上传成功!");
-            console.log("文档ID:", documentResponse.results.documentId);
-            result = {
-                results: documentResponse.results,
-                title: title
+        form.append('metadata', JSON.stringify(finalMetadata));
+        form.append('ingestion_mode', ingestion_mode || 'fast');
+        form.append('run_with_orchestration', 'false');
+        // 打印上传参数,方便排查问题
+        console.log('Request Headers:', form.getHeaders());
+        const url = 'https://r2rserver.cocorobo.cn/v3/documents';
+        const response = await axios.post(url, form, {
+            headers: {
+                ...form.getHeaders(),
+                // 'User-Agent': 'NodeClient/1.0'
             }
-            res.json(result);
-        } catch (error) {
-            console.error('Error occurred:', error);
-            res.status(500).json({error: error.message});
-            // 如果 409 冲突,尝试删除后重试
-            // if (error.message && error.message.includes('409')) {
-            //     // 从错误信息中提取 documentId
-            //     const match = error.message.match(/Document ([\w-]+) is currently ingesting/);
-            //     const conflictDocId = match && match[1];
-            //     if (conflictDocId) {
-            //         try {
-            //             await client.documents.delete({id: conflictDocId});
-            //             console.log('冲突时已删除重复文档:', conflictDocId);
-            //             // 再次尝试上传
-            //             const documentResponse = await client.documents.create(createParams);
-            //             result = {
-            //                 results: documentResponse.results,
-            //                 title: title
-            //             }
-            //             res.json(result);
-            //             return;
-            //         } catch (e2) {
-            //             console.error('冲突时删除重试也失败:', e2);
-            //             throw e2;
-            //         }
-            //     } else {
-            //         throw error;
-            //     }
-            // } else {
-            //     throw error;
-            // }
+        });
+        fs.unlinkSync(tmpFilePath);
+        result = {
+            results: response.data.results,
+            title: finalTitle
         }
+        res.json(result);
     } catch (error) {
         console.error('Error occurred:', error);
         res.status(500).json({error: error.message});