Ingest a PDF from the data/ directory, which contains EDGAR 10-K filing data for Nike, into a Chroma vector store.
()
| 8 | |
| 9 | |
def ingest_documents():
    """
    Ingest a 10-K filing from the data/ directory into a Chroma vector store.

    The first entry (by sorted name) of ``data/`` is loaded, split into
    chunks of up to 1500 characters with a 100-character overlap, embedded
    with the ``sentence-transformers/all-MiniLM-L6-v2`` model and written to
    the ``xeon-rag`` Chroma collection persisted under ``/tmp/xeon_rag_db``.
    The directory is expected to contain Edgar 10k filings data for Nike.

    Only one file is ingested per call; any further entries in ``data/``
    are ignored.

    Raises:
        IndexError: If ``data/`` contains no entries.
    """
    data_path = "data/"

    # os.listdir returns entries in arbitrary order; sort them so the same
    # file is selected on every run.
    entries = sorted(os.listdir(data_path))
    if not entries:
        # Indexing the empty listing used to raise a bare IndexError; keep
        # the exception type but explain what is actually wrong.
        raise IndexError(f"No files found in {data_path!r}; nothing to ingest")
    doc_path = os.path.join(data_path, entries[0])

    print("Parsing 10k filing doc for NIKE", doc_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc_path, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    # Embedding model used to vectorise every chunk
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Rebuild each chunk as a Document carrying the same text and metadata
    documents = [
        Document(page_content=chunk.page_content, metadata=chunk.metadata)
        for chunk in chunks
    ]

    # Embed the documents and persist them in the vector DB
    Chroma.from_documents(
        documents=documents,
        collection_name="xeon-rag",
        embedding=embedder,
        persist_directory="/tmp/xeon_rag_db",
    )
| 46 | |
| 47 | |
| 48 | if __name__ == "__main__": |
no test coverage detected