123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
- import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
- import { PineconeStore } from 'langchain/vectorstores/pinecone';
- import { pinecone } from '@/utils/pinecone-client';
- import { CustomPDFLoader } from '@/utils/customPDFLoader';
- import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
- import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
/* Directory (relative path) whose files are loaded for ingestion. */
const filePath = 'docs';
/**
 * Ingest the files found under `filePath` into the Pinecone index.
 *
 * Steps: load the directory with `DirectoryLoader` (`.pdf` files go through
 * `CustomPDFLoader`), split the loaded documents with
 * `RecursiveCharacterTextSplitter` (chunkSize 1000, chunkOverlap 200), then
 * hand the chunks and an `OpenAIEmbeddings` instance to
 * `PineconeStore.fromDocuments`, targeting `PINECONE_INDEX_NAME` and
 * `PINECONE_NAME_SPACE`.
 *
 * @returns {Promise<void>} Resolves once `PineconeStore.fromDocuments` settles.
 * @throws {Error} 'Failed to ingest your data' when any step fails; the
 *   underlying error is logged and attached as `cause`.
 */
export const run = async () => {
  try {
    /* Load raw docs from all files in the directory */
    const directoryLoader = new DirectoryLoader(filePath, {
      '.pdf': (path) => new CustomPDFLoader(path),
    });
    const rawDocs = await directoryLoader.load();

    /* Split text into chunks */
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });
    const docs = await textSplitter.splitDocuments(rawDocs);
    console.log('split docs', docs);

    console.log('creating vector store...');
    /* Create and store the embeddings in the vector store */
    const embeddings = new OpenAIEmbeddings();
    // Index name comes from config; change PINECONE_INDEX_NAME to your own index.
    const index = pinecone.Index(PINECONE_INDEX_NAME);

    /* Embed the split documents and store them in the index */
    await PineconeStore.fromDocuments(docs, embeddings, {
      pineconeIndex: index,
      namespace: PINECONE_NAME_SPACE,
      textKey: 'text',
    });
  } catch (error) {
    console.log('error', error);
    // Keep the public message stable, but preserve the root failure so callers
    // can inspect what actually went wrong instead of only the generic text.
    throw new Error('Failed to ingest your data', { cause: error });
  }
};
/*
 * Script entry point: run the ingestion as soon as the module is evaluated.
 * 'ingestion complete' is printed only when run() resolves. A rejection is
 * handled here rather than left as a floating promise: the error is reported
 * and the process is marked as failed via a non-zero exit code.
 */
(async () => {
  try {
    await run();
    console.log('ingestion complete');
  } catch (error) {
    console.error(error);
    process.exitCode = 1;
  }
})();
|