浏览代码

Merge remote-tracking branch 'origin/feat-web2md-import'

lsc 2 天之前
父节点
当前提交
6d7608083f
共有 3 个文件被更改,包括 208 次插入3 次删除
  1. 69 1
      package-lock.json
  2. 3 1
      package.json
  3. 136 1
      pbl.js

+ 69 - 1
package-lock.json

@@ -4576,6 +4576,11 @@
         "@jridgewell/sourcemap-codec": "^1.4.14"
       }
     },
+    "@mixmark-io/domino": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz",
+      "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="
+    },
     "@rrweb/types": {
       "version": "2.0.0-alpha.17",
       "resolved": "https://registry.npmmirror.com/@rrweb/types/-/types-2.0.0-alpha.17.tgz",
@@ -5357,6 +5362,12 @@
       "resolved": "https://registry.npmmirror.com/fs.realpath/-/fs.realpath-1.0.0.tgz",
       "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw=="
     },
+    "fsevents": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "optional": true
+    },
     "function-bind": {
       "version": "1.1.2",
       "resolved": "https://registry.npmmirror.com/function-bind/-/function-bind-1.1.2.tgz",
@@ -6180,6 +6191,20 @@
       "resolved": "https://registry.npmmirror.com/pirates/-/pirates-4.0.6.tgz",
       "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg=="
     },
+    "playwright": {
+      "version": "1.53.0",
+      "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.0.tgz",
+      "integrity": "sha512-ghGNnIEYZC4E+YtclRn4/p6oYbdPiASELBIYkBXfaTVKreQUYbMUYQDwS12a8F0/HtIjr/CkGjtwABeFPGcS4Q==",
+      "requires": {
+        "fsevents": "2.3.2",
+        "playwright-core": "1.53.0"
+      }
+    },
+    "playwright-core": {
+      "version": "1.53.0",
+      "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.0.tgz",
+      "integrity": "sha512-mGLg8m0pm4+mmtB7M89Xw/GSqoNC+twivl8ITteqvAndachozYe2ZA7srU6uleV1vEdAHYqjq+SV8SNxRRFYBw=="
+    },
     "postcss": {
       "version": "8.5.1",
       "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.5.1.tgz",
@@ -6314,13 +6339,15 @@
         "mysql": "^2.18.1",
         "nodemon": "^3.1.9",
         "pg": "^8.13.1",
+        "playwright": "^1.53.0",
         "prettier": "^3.3.3",
         "querystring": "^0.2.1",
         "r2r-js": "^0.4.34",
         "r2r-webdev-template": "file:",
         "react": "^18",
         "react-dom": "^18",
-        "request": "^2.88.2"
+        "request": "^2.88.2",
+        "turndown": "^7.2.0"
       },
       "dependencies": {
         "@ampproject/remapping": {
@@ -6784,6 +6811,11 @@
             "@jridgewell/sourcemap-codec": "^1.4.14"
           }
         },
+        "@mixmark-io/domino": {
+          "version": "2.2.0",
+          "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz",
+          "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="
+        },
         "@rrweb/types": {
           "version": "2.0.0-alpha.17",
           "resolved": "https://registry.npmmirror.com/@rrweb/types/-/types-2.0.0-alpha.17.tgz",
@@ -7565,6 +7597,12 @@
           "resolved": "https://registry.npmmirror.com/fs.realpath/-/fs.realpath-1.0.0.tgz",
           "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw=="
         },
+        "fsevents": {
+          "version": "2.3.2",
+          "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+          "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+          "optional": true
+        },
         "function-bind": {
           "version": "1.1.2",
           "resolved": "https://registry.npmmirror.com/function-bind/-/function-bind-1.1.2.tgz",
@@ -8388,6 +8426,20 @@
           "resolved": "https://registry.npmmirror.com/pirates/-/pirates-4.0.6.tgz",
           "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg=="
         },
+        "playwright": {
+          "version": "1.53.0",
+          "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.0.tgz",
+          "integrity": "sha512-ghGNnIEYZC4E+YtclRn4/p6oYbdPiASELBIYkBXfaTVKreQUYbMUYQDwS12a8F0/HtIjr/CkGjtwABeFPGcS4Q==",
+          "requires": {
+            "fsevents": "2.3.2",
+            "playwright-core": "1.53.0"
+          }
+        },
+        "playwright-core": {
+          "version": "1.53.0",
+          "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.0.tgz",
+          "integrity": "sha512-mGLg8m0pm4+mmtB7M89Xw/GSqoNC+twivl8ITteqvAndachozYe2ZA7srU6uleV1vEdAHYqjq+SV8SNxRRFYBw=="
+        },
         "postcss": {
           "version": "8.5.1",
           "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.5.1.tgz",
@@ -8912,6 +8964,14 @@
             }
           }
         },
+        "turndown": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.0.tgz",
+          "integrity": "sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==",
+          "requires": {
+            "@mixmark-io/domino": "^2.2.0"
+          }
+        },
         "tweetnacl": {
           "version": "0.14.5",
           "resolved": "https://registry.npmmirror.com/tweetnacl/-/tweetnacl-0.14.5.tgz",
@@ -9449,6 +9509,14 @@
         }
       }
     },
+    "turndown": {
+      "version": "7.2.0",
+      "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.0.tgz",
+      "integrity": "sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==",
+      "requires": {
+        "@mixmark-io/domino": "^2.2.0"
+      }
+    },
     "tweetnacl": {
       "version": "0.14.5",
       "resolved": "https://registry.npmmirror.com/tweetnacl/-/tweetnacl-0.14.5.tgz",

+ 3 - 1
package.json

@@ -21,12 +21,14 @@
     "multer": "^1.4.5-lts.1",
     "mysql": "^2.18.1",
     "pg": "^8.13.1",
+    "playwright": "^1.53.0",
     "querystring": "^0.2.1",
     "r2r-js": "^0.4.34",
     "r2r-webdev-template": "file:",
     "react": "^18",
     "react-dom": "^18",
-    "request": "^2.88.2"
+    "request": "^2.88.2",
+    "turndown": "^7.2.0"
   },
   "devDependencies": {
     "nodemon": "^3.1.9",

+ 136 - 1
pbl.js

@@ -23,6 +23,8 @@ const https = require("https");
 const { Pool } = require('pg');
 const axios = require('axios');
 const multer = require("multer");
+const {chromium} = require('playwright');
+const TurndownService = require('turndown');
 
 const loginHandler = require('./login'); // 确保路径正确
 const { r2rClient } = require("r2r-js");
@@ -104,7 +106,6 @@ router.route("/extract").all(async (req, res, next) => {
 });
 
 
-
 router.route("/selectUser").all((req, res, next) => {
     var json = queryString(req.url);
     getmysql(req, res, "selectUser", json["userid"]);
@@ -1302,6 +1303,140 @@ postmysqlOnline = function (req, res) {
     }
 };
 
+// 新增测试接口,直接返回成功
+router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res) => {
+    // console.log('收到的 req.body:', req.body);
+    // console.log('收到的 req.files:', req.files);
+    // console.log('收到的 headers:', req.headers);
+    const url = req.body.url; // 前端传入 url
+    if (!url) {
+        return res.status(400).json({error: 'Missing url'});
+    }
+    try {
+        const browser = await chromium.launch({headless: true});
+        const page = await browser.newPage();
+        await page.goto(url, {waitUntil: 'networkidle'});
+        const html = await page.content();
+        await browser.close();
+        // 新增:HTML 转 Markdown
+        const turndownService = new TurndownService();
+        const markdownContent = turndownService.turndown(html);
+        // 新增:解析 <title> 标签内容
+        let title = '';
+        const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
+        if (titleMatch && titleMatch[1]) {
+            title = titleMatch[1].trim();
+        }
+
+        console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
+
+        // 解析用户传入的参数
+        const {
+            collection_ids,
+            id: customId,
+            metadata: customMetadata,
+            ingestion_mode,
+            run_with_orchestration
+        } = req.body;
+
+        // 处理 metadata 字段
+        let finalMetadata = customMetadata;
+        if (typeof finalMetadata === 'string') {
+            try {
+                finalMetadata = JSON.parse(finalMetadata);
+            } catch (e) {
+                return res.status(400).json({error: 'metadata 不是合法的 JSON 字符串'});
+            }
+        }
+        if (!finalMetadata) {
+            finalMetadata = {
+                title: title || "抓取的网页内容",
+                source_url: url,
+                content_type: "markdown",
+                scraped_at: new Date().toISOString(),
+                upload_id: Date.now() + '-' + Math.random()
+            };
+        }
+        // 处理 collection_ids 字段
+        let finalCollectionIds = collection_ids;
+        if (typeof finalCollectionIds === 'string') {
+            try {
+                finalCollectionIds = JSON.parse(finalCollectionIds);
+            } catch (e) {
+                // 不是合法 JSON 字符串就当作普通字符串
+            }
+        }
+        // 如果 metadata 里没有 collection_ids,但用户传了 collection_ids,则加进去
+        if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
+            finalMetadata.collection_ids = finalCollectionIds;
+        }
+
+        // 组装 create 参数
+        const createParams = {
+            raw_text: markdownContent,
+            metadata: finalMetadata
+        };
+        if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
+            createParams.collection_ids = finalCollectionIds;
+        }
+        if (customId) {
+            createParams.id = customId;
+        }
+        if (ingestion_mode) {
+            createParams.ingestion_mode = ingestion_mode;
+        } else {
+            createParams.ingestionMode = "fast";
+        }
+        if (typeof run_with_orchestration !== 'undefined') {
+            createParams.run_with_orchestration = run_with_orchestration;
+        }
 
+        // 4. 将 markdown 内容上传到 R2R
+        console.log("正在上传文档...");
+        try {
+            const documentResponse = await client.documents.create(createParams);
+            console.log("文档上传成功!");
+            console.log("文档ID:", documentResponse.results.documentId);
+            result = {
+                results: documentResponse.results,
+                title: title
+            }
+            res.json(result);
+        } catch (error) {
+            console.error('Error occurred:', error);
+            res.status(500).json({error: error.message});
+            // 如果 409 冲突,尝试删除后重试
+            // if (error.message && error.message.includes('409')) {
+            //     // 从错误信息中提取 documentId
+            //     const match = error.message.match(/Document ([\w-]+) is currently ingesting/);
+            //     const conflictDocId = match && match[1];
+            //     if (conflictDocId) {
+            //         try {
+            //             await client.documents.delete({id: conflictDocId});
+            //             console.log('冲突时已删除重复文档:', conflictDocId);
+            //             // 再次尝试上传
+            //             const documentResponse = await client.documents.create(createParams);
+            //             result = {
+            //                 results: documentResponse.results,
+            //                 title: title
+            //             }
+            //             res.json(result);
+            //             return;
+            //         } catch (e2) {
+            //             console.error('冲突时删除重试也失败:', e2);
+            //             throw e2;
+            //         }
+            //     } else {
+            //         throw error;
+            //     }
+            // } else {
+            //     throw error;
+            // }
+        }
+    } catch (error) {
+        console.error('Error occurred:', error);
+        res.status(500).json({error: error.message});
+    }
+});
 
 module.exports = router;