I got a list of summarized PDFs and i want to embed them using a Huggingface model. After that i want to save them in a pinecone database with the namespace of their original document name. I always get some kind of ValueError because the vector is invalid. Any Solutions how I could achieve my goal?
This was my attempt so far:
class Embedding:
def __init__(self):
self.embeddings = HuggingFaceEmbeddings(model_name="paraphrase-MiniLM-L6-v2")
def initialize_pinecone(self):
pinecone.init(
api_key="apikey",
environment="environment",
)
def load_documents(self, folder_path):
document_loader = DirectoryLoader(path=folder_path)
documents = document_loader.load()
return documents
def split_documents(self, documents, chunk_size=1000, chunk_overlap=100):
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
split_docs = text_splitter.split_documents(documents)
return split_docs
def embed(self):
print()
print("-------EMBEDDING-------")
self.initialize_pinecone()
summarizedTextsPath = os.path.join(os.path.dirname(__file__), "summarizedTexts")
summarizedDocs = self.load_documents(summarizedTextsPath)
summarizedDocsSplit = self.split_documents(summarizedDocs)
index_name = "lecture-index"
# Embed and store each document with its filename as namespace
for i, doc in enumerate(summarizedDocsSplit):
text = doc.page_content
vector = self.embeddings.embed_query(text)
doc_name = os.path.splitext(os.path.basename(doc.metadata["source"]))[0]
index = pinecone.Index(index_name)
index.upsert(ids=[i], vectors=vector, namespaces=[doc_name])
print("Database access successful.")```