Ingest a PDF into Redis from the data/ directory, which contains EDGAR 10-K filing data for Nike.
()
| 8 | |
| 9 | |
def ingest_documents():
    """
    Ingest one 10-K filing for Nike from the data/ directory into Redis.

    The selected file is loaded, split into overlapping text chunks, and each
    chunk (prefixed with the company name) is embedded with the model named
    by EMBED_MODEL and stored in the Redis index INDEX_NAME at REDIS_URL.

    Only a single document is ingested. PDF files are preferred; when the
    directory holds no PDF, the first regular file (by sorted path) is used.

    Raises:
        FileNotFoundError: If data/ does not exist or contains no regular
            files to ingest.
    """
    company_name = "Nike"
    data_path = "data/"

    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the choice of document reproducible, and skip sub-directories because
    # the loader below expects a file.
    entries = (os.path.join(data_path, name) for name in os.listdir(data_path))
    candidates = sorted(path for path in entries if os.path.isfile(path))
    if not candidates:
        raise FileNotFoundError(f"No files to ingest in {data_path!r}")

    # Prefer an actual PDF over stray files (e.g. .gitkeep) that may sit
    # alongside it; fall back to any file to stay backward-compatible.
    pdfs = [path for path in candidates if path.lower().endswith(".pdf")]
    doc = (pdfs or candidates)[0]

    print("Parsing 10k filing doc for NIKE", doc)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    # Create vectorstore
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    # The returned vectorstore handle is not needed; only the write matters.
    _ = Redis.from_texts(
        # appending this little bit can sometimes help with semantic retrieval
        # especially with multiple companies
        texts=[f"Company: {company_name}. " + chunk.page_content for chunk in chunks],
        metadatas=[chunk.metadata for chunk in chunks],
        embedding=embedder,
        index_name=INDEX_NAME,
        index_schema=INDEX_SCHEMA,
        redis_url=REDIS_URL,
    )
| 42 | |
| 43 | |
| 44 | if __name__ == "__main__": |
no test coverage detected