ingest-data.ts

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { PineconeStore } from 'langchain/vectorstores/pinecone';
import { pinecone } from '@/utils/pinecone-client';
import { CustomPDFLoader } from '@/utils/customPDFLoader';
import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';

/* Name of directory to retrieve your files from */
const filePath = 'docs';

export const run = async () => {
  try {
    /* Load raw docs from all files in the directory */
    const directoryLoader = new DirectoryLoader(filePath, {
      '.pdf': (path) => new CustomPDFLoader(path),
    });

    // const loader = new PDFLoader(filePath);
    const rawDocs = await directoryLoader.load();

    /* Split text into chunks */
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });

    const docs = await textSplitter.splitDocuments(rawDocs);
    console.log('split docs', docs);

    console.log('creating vector store...');
    /* Create and store the embeddings in the vectorStore */
    const embeddings = new OpenAIEmbeddings();
    const index = pinecone.Index(PINECONE_INDEX_NAME); // change to your own index name

    // Embed the PDF documents
    await PineconeStore.fromDocuments(docs, embeddings, {
      pineconeIndex: index,
      namespace: PINECONE_NAME_SPACE,
      textKey: 'text',
    });
  } catch (error) {
    console.log('error', error);
    throw new Error('Failed to ingest your data');
  }
};

(async () => {
  await run();
  console.log('ingestion complete');
})();
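
For reference, below is a minimal sketch of what the two local modules imported above (@/config/pinecone and @/utils/pinecone-client) might contain. This is an assumption rather than the project's actual code: the index name and namespace are placeholders, and the client setup assumes the legacy PineconeClient API from @pinecone-database/pinecone (newer SDK versions construct the client with new Pinecone({ apiKey }) instead).

// config/pinecone.ts (hypothetical sketch; replace with your own index name and namespace)
export const PINECONE_INDEX_NAME = 'your-index-name';
export const PINECONE_NAME_SPACE = 'pdf-test';

// utils/pinecone-client.ts (hypothetical sketch; assumes the legacy PineconeClient API)
import { PineconeClient } from '@pinecone-database/pinecone';

if (!process.env.PINECONE_API_KEY || !process.env.PINECONE_ENVIRONMENT) {
  throw new Error('Missing PINECONE_API_KEY or PINECONE_ENVIRONMENT in your environment');
}

async function initPinecone() {
  const client = new PineconeClient();
  // init() points the client at the environment/region where your index lives
  await client.init({
    apiKey: process.env.PINECONE_API_KEY ?? '',
    environment: process.env.PINECONE_ENVIRONMENT ?? '',
  });
  return client;
}

// Top-level await requires an ESM/ts-node setup that supports it
export const pinecone = await initPinecone();

To run the ingestion, set OPENAI_API_KEY and the Pinecone variables above, then execute the script with a TypeScript runner such as tsx (for example, npx tsx ingest-data.ts, adjusting the path to wherever the file lives in your project).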