0

I got a list of summarized PDFs and i want to embed them using a Huggingface model. After that i want to save them in a pinecone database with the namespace of their original document name. I always get some kind of ValueError because the vector is invalid. Any Solutions how I could achieve my goal?

This was my attempt so far:

class Embedding:
    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings(model_name="paraphrase-MiniLM-L6-v2")

    def initialize_pinecone(self):
        pinecone.init(
            api_key="apikey",
            environment="environment",
        )

    def load_documents(self, folder_path):
        document_loader = DirectoryLoader(path=folder_path)
        documents = document_loader.load()
        return documents

    def split_documents(self, documents, chunk_size=1000, chunk_overlap=100):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        split_docs = text_splitter.split_documents(documents)
        return split_docs

    def embed(self):
        print()
        print("-------EMBEDDING-------")
        self.initialize_pinecone()

        summarizedTextsPath = os.path.join(os.path.dirname(__file__), "summarizedTexts")

        summarizedDocs = self.load_documents(summarizedTextsPath)
        summarizedDocsSplit = self.split_documents(summarizedDocs)


        index_name = "lecture-index"

        # Embed and store each document with its filename as namespace
        for i, doc in enumerate(summarizedDocsSplit):
            text = doc.page_content
            vector = self.embeddings.embed_query(text)
            doc_name = os.path.splitext(os.path.basename(doc.metadata["source"]))[0]
            index = pinecone.Index(index_name)
            index.upsert(ids=[i], vectors=vector, namespaces=[doc_name])

        print("Database access successful.")```
desertnaut
  • 57,590
  • 26
  • 140
  • 166

0 Answers0