// customPDFLoader.ts
  1. import { Document } from 'langchain/document';
  2. import { readFile } from 'fs/promises';
  3. import { BaseDocumentLoader } from 'langchain/document_loaders';
  /**
   * Base document loader that obtains raw bytes either from a file path or from
   * a Blob, then hands them to the subclass-provided `parse` method.
   */
  export abstract class BufferLoader extends BaseDocumentLoader {
    /**
     * @param filePathOrBlob - A filesystem path to read, or a Blob whose
     *   contents are used directly.
     */
    constructor(public filePathOrBlob: string | Blob) {
      super();
    }

    /**
     * Turns the raw bytes into documents.
     *
     * @param raw - The bytes read from the file or Blob.
     * @param metadata - Source information assembled by `load`.
     */
    protected abstract parse(
      raw: Buffer,
      metadata: Document['metadata'],
    ): Promise<Document[]>;

    /**
     * Reads the configured source into a Buffer and delegates to `parse`.
     *
     * For a string source the metadata is `{ source: <path> }`; for a Blob it is
     * `{ source: 'blob', blobType: <blob.type> }`.
     *
     * @returns Whatever `parse` resolves to.
     */
    public async load(): Promise<Document[]> {
      // String sources are treated as file paths and read from disk.
      if (typeof this.filePathOrBlob === 'string') {
        const fileBytes = await readFile(this.filePathOrBlob);
        return this.parse(fileBytes, { source: this.filePathOrBlob });
      }

      // Otherwise the source is a Blob: copy its contents into a Buffer.
      const blobContents = await this.filePathOrBlob.arrayBuffer();
      return this.parse(Buffer.from(blobContents), {
        source: 'blob',
        blobType: this.filePathOrBlob.type,
      });
    }
  }
  /**
   * Loader that parses PDF bytes with `pdf-parse` and returns the extracted
   * text as a single document.
   */
  export class CustomPDFLoader extends BufferLoader {
    /**
     * Parses the given PDF bytes into one `Document`.
     *
     * @param raw - The PDF file contents.
     * @param metadata - Source metadata; copied into the result and extended
     *   with `pdf_numpages` taken from the parser's `numpages` field.
     * @returns A one-element array holding the document built from the parsed
     *   text.
     */
    public async parse(
      raw: Buffer,
      metadata: Document['metadata'],
    ): Promise<Document[]> {
      const { pdf } = await PDFLoaderImports();
      const pdfResult = await pdf(raw);

      const document = new Document({
        pageContent: pdfResult.text,
        metadata: { ...metadata, pdf_numpages: pdfResult.numpages },
      });

      return [document];
    }
  }
  /**
   * Lazily imports the `pdf-parse` implementation module.
   *
   * @returns An object whose `pdf` property is the module's default export.
   * @throws Error with an installation hint when the import fails; the
   *   underlying error is written to `console.error` first.
   */
  async function PDFLoaderImports() {
    const installHint =
      'Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.';

    try {
      // the main entrypoint has some debug code that we don't want to import
      const pdfParseModule = await import('pdf-parse/lib/pdf-parse.js');
      return { pdf: pdfParseModule.default };
    } catch (importError) {
      console.error(importError);
      throw new Error(installHint);
    }
  }