|
@@ -23,6 +23,8 @@ const https = require("https");
|
|
|
const { Pool } = require('pg');
|
|
|
const axios = require('axios');
|
|
|
const multer = require("multer");
|
|
|
+const {chromium} = require('playwright');
|
|
|
+const TurndownService = require('turndown');
|
|
|
|
|
|
const loginHandler = require('./login'); // 确保路径正确
|
|
|
const { r2rClient } = require("r2r-js");
|
|
@@ -104,7 +106,6 @@ router.route("/extract").all(async (req, res, next) => {
|
|
|
});
|
|
|
|
|
|
|
|
|
-
|
|
|
router.route("/selectUser").all((req, res, next) => {
|
|
|
var json = queryString(req.url);
|
|
|
getmysql(req, res, "selectUser", json["userid"]);
|
|
@@ -1302,6 +1303,140 @@ postmysqlOnline = function (req, res) {
|
|
|
}
|
|
|
};
|
|
|
|
|
|
/**
 * Loads `url` in headless Chromium and returns the HTML of the loaded page.
 *
 * The browser is closed in a `finally` block so that a navigation error or
 * timeout cannot leave a Chromium process running.
 *
 * @param {string} url - Absolute URL to navigate to.
 * @returns {Promise<string>} The page content as returned by `page.content()`.
 */
async function renderPageHtml(url) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    return await page.content();
  } finally {
    await browser.close();
  }
}

/**
 * Returns the trimmed text of the first `<title>` element found in `html`,
 * or an empty string when there is none.
 *
 * @param {string} html - HTML source to search.
 * @returns {string} The title text, or `''`.
 */
function extractHtmlTitle(html) {
  const titleMatch = html.match(/<title>([\s\S]*?)<\/title>/i);
  return titleMatch && titleMatch[1] ? titleMatch[1].trim() : '';
}

/**
 * POST /knowledge-base/html-to-markdown
 *
 * Renders the page at `req.body.url`, converts its HTML to Markdown with
 * Turndown, and uploads the Markdown through `client.documents.create`.
 *
 * Body fields (multipart form without files, parsed by `upload.none()`):
 *   - url                    (required) http(s) URL of the page to scrape
 *   - metadata               (optional) object, or JSON string of an object
 *   - collection_ids         (optional) array, or JSON string of an array
 *   - id                     (optional) forwarded as `id`
 *   - ingestion_mode         (optional) forwarded as `ingestion_mode`
 *   - run_with_orchestration (optional) forwarded unchanged
 *
 * Responses:
 *   - 200 `{ results, title }` on success
 *   - 400 `{ error }` for a missing/invalid url or invalid metadata
 *   - 500 `{ error }` when rendering, conversion or upload fails
 *
 * NOTE(review): `client` is not defined in this block; it is assumed to be
 * the R2R client created elsewhere in this file.
 */
router.post('/knowledge-base/html-to-markdown', upload.none(), async (req, res) => {
  // Guard against a missing body so the destructuring below cannot throw
  // outside of any try/catch.
  const body = req.body || {};
  const {
    url,
    collection_ids,
    id: customId,
    metadata: customMetadata,
    ingestion_mode,
    run_with_orchestration
  } = body;

  if (!url) {
    return res.status(400).json({error: 'Missing url'});
  }

  // The URL comes from the client and is handed to a real browser, so only
  // plain web URLs are accepted (this rejects file:, chrome:, data:, ...).
  // NOTE(review): hosts are not restricted, so internal network addresses are
  // still reachable from this endpoint - confirm whether that is acceptable.
  let targetUrl;
  try {
    targetUrl = new URL(url);
  } catch (e) {
    return res.status(400).json({error: 'Invalid url'});
  }
  if (targetUrl.protocol !== 'http:' && targetUrl.protocol !== 'https:') {
    return res.status(400).json({error: 'Invalid url: only http and https are supported'});
  }

  // Validate metadata before the expensive page render so bad input fails fast.
  let parsedMetadata = customMetadata;
  if (typeof parsedMetadata === 'string') {
    try {
      parsedMetadata = JSON.parse(parsedMetadata);
    } catch (e) {
      return res.status(400).json({error: 'metadata 不是合法的 JSON 字符串'});
    }
  }
  // Falsy values (absent, null, 0, false) fall back to the generated default
  // metadata further down; anything else has to be a plain JSON object.
  if (parsedMetadata && (typeof parsedMetadata !== 'object' || Array.isArray(parsedMetadata))) {
    return res.status(400).json({error: 'metadata must be a JSON object'});
  }

  // collection_ids may arrive as a JSON-encoded array. A string that is not
  // valid JSON is kept as-is, which means it fails the array check below and
  // is not forwarded.
  let finalCollectionIds = collection_ids;
  if (typeof finalCollectionIds === 'string') {
    try {
      finalCollectionIds = JSON.parse(finalCollectionIds);
    } catch (e) {
      // Deliberately ignored: see the comment above.
    }
  }
  const hasCollectionIds = Array.isArray(finalCollectionIds);

  try {
    const html = await renderPageHtml(targetUrl.href);
    const markdownContent = new TurndownService().turndown(html);
    const title = extractHtmlTitle(html);

    console.log("抓取到的内容长度:", html.length, "markdown的内容长度:", markdownContent.length);

    // Copy caller-supplied metadata so req.body is never mutated; otherwise
    // build a default record describing the scrape.
    const finalMetadata = parsedMetadata
      ? Object.assign({}, parsedMetadata)
      : {
        title: title || "抓取的网页内容",
        source_url: url,
        content_type: "markdown",
        scraped_at: new Date().toISOString(),
        upload_id: Date.now() + '-' + Math.random()
      };

    // A collection_ids array from the request always overrides any
    // collection_ids already present in the metadata.
    if (hasCollectionIds) {
      finalMetadata.collection_ids = finalCollectionIds;
    }

    // Assemble the arguments for client.documents.create.
    const createParams = {
      raw_text: markdownContent,
      metadata: finalMetadata
    };
    if (hasCollectionIds) {
      createParams.collection_ids = finalCollectionIds;
    }
    if (customId) {
      createParams.id = customId;
    }
    // NOTE(review): the caller-supplied value is sent as `ingestion_mode` but
    // the default is sent as `ingestionMode`. Both spellings are kept exactly
    // as they were; confirm against the r2r-js API which key it reads.
    if (ingestion_mode) {
      createParams.ingestion_mode = ingestion_mode;
    } else {
      createParams.ingestionMode = "fast";
    }
    if (typeof run_with_orchestration !== 'undefined') {
      createParams.run_with_orchestration = run_with_orchestration;
    }

    // Upload the Markdown content.
    // TODO: a retry that deletes the conflicting document and re-uploads on a
    // 409 ("currently ingesting") response was sketched here previously but
    // never enabled.
    console.log("正在上传文档...");
    const documentResponse = await client.documents.create(createParams);
    console.log("文档上传成功!");
    console.log("文档ID:", documentResponse.results.documentId);

    res.json({
      results: documentResponse.results,
      title: title
    });
  } catch (error) {
    console.error('Error occurred:', error);
    // Avoid a second write if the failure happened while sending the response.
    if (!res.headersSent) {
      res.status(500).json({error: error.message});
    }
  }
});
|
|
|
|
|
|
module.exports = router;
|