# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)

* vectorize text, HTML, PDF and DOCX files into one vector DB, split into tables (sqlite-vss)
* use local self-hosted embeddings (CPU computed)
  * for sentences 
* query a local sqlite vss vector db, use cache from LangChain (sqlite)
* use the OpenAI API and Mistral (self-hosted on-prem via Ollama) for the response processing
* compare with LLMware Bling 

Due to cost reasons the OpenAI embeddings don't get used. So sorry :p 

## Dependencies

* Cryptography is used to handle some PDF functions here (signatures)

In [1]:
%pip show cryptography

Name: cryptography
Version: 42.0.5
Summary: cryptography is a package which provides cryptographic recipes and primitives to Python developers.
Home-page: 
Author: 
Author-email: The Python Cryptographic Authority and individual contributors <cryptography-dev@python.org>
License: Apache-2.0 OR BSD-3-Clause
Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages
Requires: cffi
Required-by: 
Note: you may need to restart the kernel to use updated packages.


* pikepdf is used to repair some PDFs

In [3]:
%pip show pikepdf

Name: pikepdf
Version: 8.13.0
Summary: Read and write PDFs with Python, powered by qpdf
Home-page: 
Author: 
Author-email: "James R. Barlow" <james@purplerock.ca>
License: MPL-2.0
Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages
Requires: Deprecated, lxml, packaging, Pillow
Required-by: 
Note: you may need to restart the kernel to use updated packages.


* pypdf with all features is needed because this DB consists of 100+ PDFs 

In [5]:
%pip show "pypdf"

Name: pypdf
Version: 4.0.2
Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
Home-page: 
Author: 
Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>
License: 
Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.


## Text extraction

* Here the html and text data is extracted into one txt
* The PDF and DOCX data is extracted into another txt
* the texts are normalized
* different encodings get handled
* difficult files get repaired
* exceptions get handled (UTF-16 issues, PDF reference errors)

In [40]:
import glob
import os
from concurrent.futures import ThreadPoolExecutor
import unicodedata  # to normalize text
import html2text  # to convert html to text
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
import pikepdf  # to repair PDFs
from pathlib import Path
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# todo: hardcoded paths
output_path = "/home/marius/source/bookworm/export.txt"

def convert_html_to_text(html_blob: str) -> str:
    """
    Render an HTML blob as text with html2text.

    A fresh HTML2Text converter is created per call and the flags listed
    below are switched on before the blob is handled.
    """
    # use_automatic_links is not set here.
    enabled_flags = (
        "mark_code",
        "escape_snob",
        "unicode_snob",
        "images_as_html",
        "single_line_break",
        "ignore_links",
    )
    converter = html2text.HTML2Text()
    for flag_name in enabled_flags:
        setattr(converter, flag_name, True)
    return converter.handle(html_blob)

def normalize_text(txt_blob: str) -> str:
    """
    Return ``txt_blob`` converted to the Unicode NFKD normalization form.
    """
    normalization_form = "NFKD"
    normalized = unicodedata.normalize(normalization_form, txt_blob)
    return normalized

def repair_pdf(file_path: str) -> bool:
    """
    Try to repair a PDF by opening it with pikepdf and saving it back
    over the same path.

    Returns True when the save succeeded, False (after printing the error)
    when pikepdf raised a PdfError.
    """
    try:
        # allow_overwriting_input is required because input and output are the same file.
        with pikepdf.open(file_path, allow_overwriting_input=True) as document:
            document.save(file_path)
    except pikepdf.PdfError as repair_error:
        print(f"Failed to repair PDF {file_path}: {repair_error}")
        return False
    else:
        return True

def _join_page_content(documents) -> str:
    """Join the ``page_content`` of loaded documents with newlines."""
    return "\n".join(doc.page_content for doc in documents if hasattr(doc, 'page_content'))


def read_and_convert_file(file_path: str, is_html: bool, is_pdf: bool, is_docx: bool) -> str:
    """
    Reads a file and converts it from HTML, PDF, DOCX, or plain text to text.

    The flags are checked in the order HTML, PDF, DOCX; when none is set the
    file is read as UTF-8 plain text. PDF, DOCX and plain-text content is
    passed through normalize_text; HTML is returned as produced by
    convert_html_to_text.

    A PDF that fails to load is handed to repair_pdf (which rewrites the file
    in place) and is then loaded one more time.

    Returns the extracted text, or "" when the file could not be read or
    converted (the error is printed instead of raised).
    """
    if is_html:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            return convert_html_to_text(content)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return ""

    if is_pdf:
        try:
            content = _join_page_content(PyPDFLoader(file_path).load())
        except Exception as e:
            print(f"Error loading PDF {file_path}: {e}. Attempting to repair...")
            if not repair_pdf(file_path):
                return ""
            try:
                content = _join_page_content(PyPDFLoader(file_path).load())
            except Exception as retry_error:
                print(f"Failed to process PDF {file_path} after repair: {retry_error}")
                return ""
        return normalize_text(content)

    if is_docx:
        try:
            # The loader returns Document objects, not strings, so the text has
            # to be taken from page_content; joining the objects directly raised
            # a TypeError that was swallowed below and dropped every DOCX.
            content = _join_page_content(Docx2txtLoader(file_path).load())
        except Exception as e:
            print(f"Error reading DOCX {file_path}: {e}")
            return ""
        return normalize_text(content)

    # For plain text files
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return normalize_text(file.read())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

def sanitize_text(text):
    """
    Round-trip a string through UTF-8, replacing characters that cannot be
    encoded (such as lone surrogates).
    """
    utf8_bytes = text.encode('utf-8', errors='replace')
    return utf8_bytes.decode('utf-8')

def append_to_output(data: str, is_pdf: bool, is_docx: bool, output_path: str):
    """
    Sanitize ``data`` and append it to the output file.

    PDF and DOCX content goes to a sibling file: the suffix of
    ``output_path`` is stripped and ".documents.txt" is appended.
    Everything else is appended to ``output_path`` itself.
    """
    cleaned = sanitize_text(data)

    target_path = output_path
    if is_pdf or is_docx:
        stem = Path(output_path).with_suffix('')
        target_path = f"{stem}.documents.txt"

    with open(target_path, mode="a", encoding='utf-8') as sink:
        sink.write(cleaned)

def process_file(file):
    """
    Convert a single file to text and append the result to the output.

    The file type is derived from the file name ending (.html, .pdf, .docx);
    the destination is the module-level ``output_path``.
    """
    is_html, is_pdf, is_docx = (
        file.endswith(extension) for extension in ('.html', '.pdf', '.docx')
    )
    extracted_text = read_and_convert_file(file, is_html, is_pdf, is_docx)
    append_to_output(extracted_text, is_pdf, is_docx, output_path=output_path)

def process_files_in_directory(directory: str):
    """
    Convert all supported files below ``directory`` using a thread pool.

    *.txt and *.html files are taken from ``directory`` itself, *.pdf and
    *.docx files from its "img" subdirectory. Each file is handed to
    process_file on one of three worker threads while a progress bar tracks
    completion.

    An exception raised by a worker does not abort the run; it is printed
    together with the file name so failures are not silently lost.
    """
    txt_html_files = glob.glob(os.path.join(directory, "*.txt")) + glob.glob(os.path.join(directory, "*.html"))
    pdf_docx_files = glob.glob(os.path.join(directory, "img", "*.pdf")) + glob.glob(os.path.join(directory, "img", "*.docx"))
    all_files = txt_html_files + pdf_docx_files

    # The context managers close the progress bar and shut the pool down even
    # if something goes wrong while waiting for the results.
    with tqdm(total=len(all_files), desc="Processing files") as pbar:
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Remember which file belongs to which future for error reporting.
            futures = {executor.submit(process_file, file): file for file in all_files}

            for future in as_completed(futures):
                # An exception raised inside a worker is stored on its future;
                # without this check it would never be seen.
                error = future.exception()
                if error is not None:
                    print(f"Error processing {futures[future]}: {error}")
                pbar.update(1)

# Run the extraction over the exported notes directory (hardcoded path).
process_files_in_directory("/home/marius/data/it-sec-research-extracted/IT sec research")


Processing files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 4868/4877 [04:02<00:00, 20.10it/s][A

Processing files:   0%|                                                                                                                                                                   | 1/4877 [00:00<16:09,  5.03it/s][A
Processing files:   5%|███████▎                                                                                                                                                        | 221/4877 [00:00<00:06, 731.58it/s][A
Processing files:   6%|█████████▊                                                                                                                                                      | 301/4877 [00:01<00:30, 149.67it/s][A
Processing files:   7%|███████████▍                                                                       

## Chunking of the texts

The texts need to get chunked (pre-processing) before the embedding process.

In [51]:
# Switch to the directory that holds export.txt and evernote.db (hardcoded path).
os.chdir("/home/marius/source/bookworm")

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text_data(path: str = '/home/marius/source/bookworm/export.txt',
                    chunk_size: int = 100,
                    chunk_overlap: int = 20):
    """
    Read the exported text file and split it into chunks.

    Args:
        path: Text file to read. Defaults to the export written by the
            text-extraction step.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The documents created by RecursiveCharacterTextSplitter.create_documents.
    """
    # The export is written as UTF-8, so read it back with the same encoding
    # instead of relying on the platform default.
    with open(path, encoding='utf-8') as f:
        text_notes = f.read()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    chunks = text_splitter.create_documents([text_notes])
    print(f'Now you have {len(chunks)} chunks')
    return chunks
    
# The module-level `chunks` is reused by the cost estimate and the embedding cells below.
chunks = chunk_text_data()

Now you have 723845 chunks


### Embedding costs - why no OpenAI?

The OpenAI API has a cost for the embeddings.
At this point there seems to be no way to pre-estimate the costs reliably. 
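The following calculation is flawed (for one thing, it applies a rate of 0.03 USD per 1,000 tokens with the gpt-4 tokenizer instead of an embedding-model price):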

In [7]:
def print_embedding_cost(texts, model='gpt-4', usd_per_1k_tokens=0.03):
    """
    Print the token count of ``texts`` and a cost estimate derived from it.

    Args:
        texts: Iterable of objects with a ``page_content`` string attribute.
        model: Model name used to look up the tiktoken encoding.
        usd_per_1k_tokens: Price in USD charged per 1,000 tokens.

    The defaults reproduce the original hard-coded calculation.
    """
    # NOTE(review): 0.03 USD per 1k tokens combined with the 'gpt-4' encoding
    # looks like a chat-model price, not an embedding price - confirm against
    # the current OpenAI price list before relying on the printed estimate.
    import tiktoken
    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {(usd_per_1k_tokens / 1_000) * total_tokens}')
    
# Estimate the cost for all chunks produced by chunk_text_data().
print_embedding_cost(chunks)

Total Tokens: 15769414
Embedding Cost in USD: 473.08241999999996


## Use Hugging Face Embeddings Sentence Transformers

* use a self-hosted on-premises model for the embedding and vectorization
* configure it for the use with the CPU

This model is from the Beijing Academy of Artificial Intelligence
* https://huggingface.co/BAAI/bge-large-en-v1.5 
* https://huggingface.co/docs/transformers/model_doc/auto 

It will produce embeddings of 1024 dimensions, roughly 500 fewer than the OpenAI embeddings.

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

# Hugging Face model id of the embedding model
modelPath = "BAAI/bge-large-en-v1.5"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to True
encode_kwargs = {'normalize_embeddings': True}

In [25]:
# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
# this model requires sentence_transformers

# `embeddings` is used as a default argument of add_texts_in_batches further down,
# so this cell has to run before that function is defined.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
 )


In [39]:
# Sanity check: embed the first chunk, then print the vector and its length.
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)
print(len(vector))

[0.02669697254896164, 0.017016947269439697, -0.0252668596804142, 0.027562297880649567, -0.036536093801259995, -0.04702713340520859, -0.008289276622235775, 0.027264799922704697, 0.0193538386374712, 0.09645088016986847, 0.04814787581562996, 0.020076178014278412, 0.04018039628863335, -0.05319904163479805, -0.0018393148202449083, 0.0004135535564273596, 0.0056005921214818954, -0.0007755732513032854, -0.01713593304157257, 0.046697087585926056, 0.02949507348239422, -0.002135329181328416, -0.06421148031949997, -0.0025980528444051743, -0.002175381872802973, -0.016258634626865387, 0.03560303896665573, 0.025036070495843887, 0.03057222068309784, 0.07080159336328506, -0.03819547966122627, -0.009489326737821102, 0.018622957170009613, -0.02314341999590397, -0.01928570494055748, 0.008893653750419617, 0.020412752404808998, -0.0026669444050639868, -0.04702509567141533, -0.07154359668493271, -0.00985659845173359, 0.0010672420030459762, 0.007422462571412325, -0.027616797015070915, -0.037166301161050797, -

## Batch process the embedding

Many data-science tasks require splitting a larger processing operation into batch jobs.
Like in the good old Mainframe days.

The vector DB: https://github.com/asg017/sqlite-vss 
Basis: https://faiss.ai/ - a library for efficient similarity search and clustering of dense vectors.

We add vectors of 1024 dimensions per chunk (sentence, line break delimited) to the DB.
The processing is done in batches of 50 chunks, using 3 threads.

In [33]:
# we can use concurrent processing to speed up the embedding process
from concurrent.futures import ThreadPoolExecutor, as_completed

from langchain_community.vectorstores import SQLiteVSS
import os
from tqdm.notebook import tqdm  # Import tqdm for notebook
from typing import List
from langchain.schema.document import Document

In [38]:
def add_texts_in_batches(batch: List[Document], sqlite_table: str = "evernote", embeddings=embeddings,
                         db_file: str = "/home/marius/source/bookworm/evernote.db") -> None:
    """
    Embed one batch of documents and store it in the sqlite-vss database.

    Using type hints is a good idea here, because error messages get swallowed
    by the ThreadPoolExecutor. The exception handling serves the same purpose:
    any error is printed instead of raised.

    A new connection is created for every call, so the SQLite connection is
    only used in the thread that opened it, and it is closed again before the
    function returns.

    This needs sqlite-vss.

    Args:
        batch: Documents to embed and insert.
        sqlite_table: Name of the target table.
        embeddings: Embedding model handed to SQLiteVSS.
        db_file: Path of the SQLite database file.
    """
    connection = None
    try:
        # Create a new connection for each batch
        # sqlite isn't thread-safe afaik
        connection = SQLiteVSS.create_connection(db_file=db_file)

        local_db = SQLiteVSS(
            table=sqlite_table,
            embedding=embeddings,
            connection=connection
        )
        local_db.add_documents(batch)

    except Exception as e:
        print(f"Exception occurred in add_texts_in_batches: {e}")

    finally:
        # Release the connection; otherwise every batch leaves one open.
        if connection is not None:
            connection.close()

def divide_chunks(chunks, n):
    """
    Lazily yield consecutive slices of ``chunks`` holding at most ``n`` items.
    """
    yield from (
        chunks[start:start + n]
        for start in range(0, len(chunks), n)
    )

def vectorize_data_in_batches(chunks, embeddings, num_workers=3, batch_size=50):
    """
    Embed ``chunks`` batch by batch on a thread pool and store them in the DB.

    Args:
        chunks: Sequence of documents to embed.
        embeddings: Embedding model, forwarded to add_texts_in_batches.
        num_workers: Number of worker threads.
        batch_size: Number of chunks per batch; adjust based on your needs and
            memory constraints.
    """
    batches = list(divide_chunks(chunks, batch_size))

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Forward the embeddings argument explicitly; otherwise the workers
        # fall back to the default bound when add_texts_in_batches was defined
        # and the parameter of this function has no effect.
        futures = [
            executor.submit(add_texts_in_batches, batch, embeddings=embeddings)
            for batch in batches
        ]

        # The context manager closes the progress bar even if waiting is interrupted.
        with tqdm(total=len(futures), desc="Processing batches") as progress_bar:
            for _ in as_completed(futures):
                # Each time a future completes, update the progress
                progress_bar.update(1)

    print("All texts have been added to the database.")
    
# Embed all chunks and write them to the vector DB.
vectorize_data_in_batches(chunks=chunks, embeddings=embeddings)

Processing batches:   0%|          | 0/14477 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [ ]:
# todo : documents go to different table. 
# todo: similarity queries