فهرست منبع

improve upload exp;

jimmylee 1 هفته پیش
والد
کامیت
2cd166044e
1 فایلهای تغییر یافته به همراه 104 افزوده شده و 91 حذف شده
  1. 104 91
      pbl.js

+ 104 - 91
pbl.js

@@ -53,6 +53,17 @@ const upload = multer({
     }
 });
 
+// Global browser instance: launched lazily by initBrowser() and then returned to every later caller.
+let globalBrowser = null;
+
+async function initBrowser() {
+    if (!globalBrowser) { // only launch when no browser has been stored yet
+        globalBrowser = await chromium.launch({headless: true}); // NOTE(review): globalBrowser stays null until this await resolves, so overlapping first calls can each launch a browser
+        console.log('Browser initialized');
+    }
+    return globalBrowser;
+}
+
 router.route("/login").all(async (req, res, next) => {
     loginHandler(req, res)
 });
@@ -1323,54 +1334,53 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
     if (!requestUrl) {
         return res.status(400).json({error: 'Missing url'});
     }
+    const browser = await initBrowser();
+    const page = await browser.newPage();
     try {
-        const browser = await chromium.launch({headless: true});
-        const page = await browser.newPage();
-        await page.goto(requestUrl, {waitUntil: 'networkidle'});
-        const html = await page.content();
-        await browser.close();
-        // 新增:HTML 转 Markdown
-        const turndownService = new TurndownService();
-        // 用 cheerio 只提取正文部分
-        const cheerioDoc = cheerio.load(html);
-        let mainHtml = cheerioDoc('main').html() || cheerioDoc('article').html() || cheerioDoc('#content').html() || cheerioDoc('.content').html() || cheerioDoc('.post').html() || cheerioDoc('body').html();
-        // 用 cheerio 过滤无关标签
-        const cheerioMain = cheerio.load(mainHtml || '');
-        cheerioMain('style, script, noscript, iframe, link, header, footer, nav, form, .ads, .advertisement').remove();
-        mainHtml = cheerioMain.html();
-        const markdownContent = turndownService.turndown(mainHtml);
-        // 新增:解析 <title> 标签内容
-        let title = '';
-        const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
-        if (titleMatch && titleMatch[1]) {
-            title = titleMatch[1].trim();
-        }
-        let finalTitle;
-        if (title && title.trim()) {
-            finalTitle = title + '.md';
-        } else {
-            finalTitle = 'untitled-' + uuidv4() + '.md';
-        }
-        console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
-
-        // 解析用户传入的参数
-        const {
-            collection_ids,
-            id: customId,
-            metadata: customMetadata,
-            ingestion_mode,
-            run_with_orchestration
-        } = req.body;
-
-        // 处理 metadata 字段
-        let finalMetadata = customMetadata;
-        if (typeof finalMetadata === 'string') {
-            try {
-                finalMetadata = JSON.parse(finalMetadata);
-            } catch (e) {
-                return res.status(400).json({error: 'metadata 不是合法的 JSON 字符串'});
+            console.log("开始抓取页面:", requestUrl);
+            // await page.goto(requestUrl, {waitUntil: 'networkidle'});
+            await page.goto(requestUrl);
+            const html = await page.content();
+            // 新增:HTML 转 Markdown
+            const turndownService = new TurndownService();
+            // 用 cheerio 只提取正文部分
+            const cheerioDoc = cheerio.load(html);
+            let mainHtml = cheerioDoc('main').html() || cheerioDoc('article').html() || cheerioDoc('#content').html() || cheerioDoc('.content').html() || cheerioDoc('.post').html() || cheerioDoc('body').html();
+            // 用 cheerio 过滤无关标签
+            const cheerioMain = cheerio.load(mainHtml || '');
+            cheerioMain('style, script, noscript, iframe, link, header, footer, nav, form, .ads, .advertisement').remove();
+            mainHtml = cheerioMain.html();
+            const markdownContent = turndownService.turndown(mainHtml);
+            // 新增:解析 <title> 标签内容
+            let title = '';
+            const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
+            if (titleMatch && titleMatch[1]) {
+                title = titleMatch[1].trim();
+            }
+            let finalTitle;
+            if (title && title.trim()) {
+                finalTitle = title + '.md';
+            } else {
+                finalTitle = 'untitled-' + uuidv4() + '.md';
+            }
+            console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);
+            // 解析用户传入的参数
+            const {
+                collection_ids,
+                id: customId,
+                metadata: customMetadata,
+                ingestion_mode,
+                run_with_orchestration
+            } = req.body;
+            // 处理 metadata 字段
+            let finalMetadata = customMetadata;
+            if (typeof finalMetadata === 'string') {
+                try {
+                    finalMetadata = JSON.parse(finalMetadata);
+                } catch (e) {
+                    return res.status(400).json({ error: 'metadata 不是合法的 JSON 字符串' });
+                }
             }
-        }
         // if (!finalMetadata) {
         //     finalMetadata = {
         //         title: title || uuidv4(),
@@ -1380,62 +1390,65 @@ router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res)
         //         // upload_id: Date.now() + '-' + Math.random()
         //     };
         // }
-        // 处理 collection_ids 字段
-        let finalCollectionIds = collection_ids;
-        if (typeof finalCollectionIds === 'string') {
-            try {
-                finalCollectionIds = JSON.parse(finalCollectionIds);
-            } catch (e) {
-                // 不是合法 JSON 字符串就当作普通字符串
+            // 处理 collection_ids 字段
+            let finalCollectionIds = collection_ids;
+            if (typeof finalCollectionIds === 'string') {
+                try {
+                    finalCollectionIds = JSON.parse(finalCollectionIds);
+                } catch (e) {
+                    // 不是合法 JSON 字符串就当作普通字符串
+                }
+            }
+            // 如果 metadata 里没有 collection_ids,但用户传了 collection_ids,则加进去
+            if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
+                finalMetadata.collection_ids = finalCollectionIds;
             }
-        }
-        // 如果 metadata 里没有 collection_ids,但用户传了 collection_ids,则加进去
-        if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
-            finalMetadata.collection_ids = finalCollectionIds;
-        }
 
-        // 补全 finalMetadata.title 和 content_type
-        if (!finalMetadata.title) {
+            // 补全 finalMetadata.title 和 content_type
+            if (!finalMetadata.title) {
             finalMetadata.title = finalTitle;
-        }
-        finalMetadata.content_type = 'markdown';
+            }
+            finalMetadata.content_type = 'markdown';
 
-        // 用 axios 发送 multipart/form-data 方式上传到 r2r
-        const FormData = require('form-data');
-        const fs = require('fs');
-        const path = require('path');
+            // 用 axios 发送 multipart/form-data 方式上传到 r2r
+            const FormData = require('form-data');
+            const fs = require('fs');
+            const path = require('path');
         const tmpFilePath = path.join(__dirname, `${title || 'untitled'}-${uuidv4()}.md`);
-        fs.writeFileSync(tmpFilePath, markdownContent, 'utf8');
-        const form = new FormData();
+            fs.writeFileSync(tmpFilePath, markdownContent, 'utf8');
+            const form = new FormData();
         form.append('file', fs.createReadStream(tmpFilePath), finalTitle);
-        if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
-            form.append('collection_ids', JSON.stringify(finalCollectionIds));
-        }
-        if (customId) {
-            form.append('id', customId);
-        }
-        form.append('metadata', JSON.stringify(finalMetadata));
-        form.append('ingestion_mode', ingestion_mode || 'fast');
-        form.append('run_with_orchestration', 'false');
-        // 打印上传参数,方便排查问题
+            if (finalCollectionIds && Array.isArray(finalCollectionIds)) {
+                form.append('collection_ids', JSON.stringify(finalCollectionIds));
+            }
+            if (customId) {
+                form.append('id', customId);
+            }
+            form.append('metadata', JSON.stringify(finalMetadata));
+            form.append('ingestion_mode', ingestion_mode || 'fast');
+            form.append('run_with_orchestration', 'false');
+            // 打印上传参数,方便排查问题
         console.log('Request Headers:', form.getHeaders());
-        const url = 'https://r2rserver.cocorobo.cn/v3/documents';
-        const response = await axios.post(url, form, {
-            headers: {
-                ...form.getHeaders(),
+            const url = 'https://r2rserver.cocorobo.cn/v3/documents';
+            const response = await axios.post(url, form, {
+                headers: {
+                    ...form.getHeaders(),
                 // 'User-Agent': 'NodeClient/1.0'
-            }
-        });
-        fs.unlinkSync(tmpFilePath);
-        result = {
+                }
+            });
+            fs.unlinkSync(tmpFilePath);
+            result = {
             results: response.data.results,
             title: finalTitle
+            }
+            res.json(result);
+        } catch (error) {
+            console.error('Message:', error.message);
+            console.error('Stack:', error.stack);
+            res.status(500).json({error: error.message});
+        } finally {
+            await page.close(); // 确保页面总是被关闭
         }
-        res.json(result);
-    } catch (error) {
-        console.error('Error occurred:', error);
-        res.status(500).json({error: error.message});
-    }
 });
 
 module.exports = router;