mirror of
https://github.com/norandom/project_bookworm.git
synced 2024-11-24 09:03:43 +00:00
825 lines
27 KiB
Plaintext
825 lines
27 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "18d62071e34b0d53",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n",
|
|
"\n",
|
|
"## Features\n",
|
|
"\n",
|
|
"* vectorize text, html files, pdfs and docx into one vector DB, split in tables (sqlite vss)\n",
|
|
"* use local self-hosted embeddings (CPU or GPU computed)\n",
|
|
" * for sentences \n",
|
|
"* query a local sqlite vss vector db, use cache from LangChain (sqlite)\n",
|
|
"* use OpenAI API and (Ollama on-prem self-hosted) Mistral for the response processing\n",
|
|
"* compare with LLMware Bling \n",
|
|
"\n",
|
|
"## Anti-Features\n",
|
|
"\n",
|
|
"* due to cost reasons the OpenAI embeddings don't get used. So sorry :p"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "94517a27e3148ff4",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"# Configuration"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "fd9747a54ea8fcef",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:35:54.949214Z",
|
|
"start_time": "2024-04-04T10:35:54.945013Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import sys\n",
|
|
"import os\n",
|
|
"IN_COLAB = 'google.colab' in sys.modules\n",
|
|
"\n",
|
|
"if not IN_COLAB:\n",
|
|
" # The Evernote DB path containing the extracted data.\n",
|
|
" extracted_evernote_db = \"/home/marius/data/it-sec-research-extracted/IT sec research\"\n",
|
|
"\n",
|
|
" # Output paths containing the Evernote text notes or documents data.\n",
|
|
" # These get generated by the data extraction process\n",
|
|
" output_path_extracted_notes = \"/home/marius/source/bookworm/export.txt\"\n",
|
|
" output_path_extracted_docs = \"/home/marius/source/bookworm/export.documents.txt\"\n",
|
|
"\n",
|
|
" # Resulting DB or vector store path.\n",
|
|
" result_db = \"/home/marius/source/bookworm/evernote.db\"\n",
|
|
"\n",
|
|
"else:\n",
|
|
" # For the Goog Colab env\n",
|
|
" output_path_extracted_notes = \"/content/export.txt\"\n",
|
|
" output_path_extracted_docs = \"/content/export.documents.txt\"\n",
|
|
" result_db = \"/content/evernote.db\"\n",
|
|
"\n",
|
|
"# To suppress some warnings\n",
|
|
"import os\n",
|
|
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"True\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a8c8692786d83c00",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"## Dependencies\n",
|
|
"\n",
|
|
"* Cryptography is used to handle some PDF functions here (signatures)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "bb34db1ea75a1edf",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:08:32.520341Z",
|
|
"start_time": "2024-04-04T10:08:30.353678Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Name: cryptography\r\n",
|
|
"Version: 42.0.5\r\n",
|
|
"Summary: cryptography is a package which provides cryptographic recipes and primitives to Python developers.\r\n",
|
|
"Home-page: \r\n",
|
|
"Author: \r\n",
|
|
"Author-email: The Python Cryptographic Authority and individual contributors <cryptography-dev@python.org>\r\n",
|
|
"License: Apache-2.0 OR BSD-3-Clause\r\n",
|
|
"Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages\r\n",
|
|
"Requires: cffi\r\n",
|
|
"Required-by: \r\n",
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%pip show cryptography"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "297746c807e95fbf",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"* pikepdf is used to repair some PDFs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "ebc8af0183532fc2",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:08:34.665865Z",
|
|
"start_time": "2024-04-04T10:08:32.522020Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Name: pikepdf\r\n",
|
|
"Version: 8.13.0\r\n",
|
|
"Summary: Read and write PDFs with Python, powered by qpdf\r\n",
|
|
"Home-page: \r\n",
|
|
"Author: \r\n",
|
|
"Author-email: \"James R. Barlow\" <james@purplerock.ca>\r\n",
|
|
"License: MPL-2.0\r\n",
|
|
"Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages\r\n",
|
|
"Requires: Deprecated, lxml, packaging, Pillow\r\n",
|
|
"Required-by: \r\n",
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%pip show pikepdf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7c7a7f6b0db3719e",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"* pypdf with all features is needed because this DB consists of 100+ PDFs "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "779f81e2ab00f73c",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:08:37.436449Z",
|
|
"start_time": "2024-04-04T10:08:35.269255Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Name: pypdf\r\n",
|
|
"Version: 4.0.2\r\n",
|
|
"Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files\r\n",
|
|
"Home-page: \r\n",
|
|
"Author: \r\n",
|
|
"Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>\r\n",
|
|
"License: \r\n",
|
|
"Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages\r\n",
|
|
"Requires: \r\n",
|
|
"Required-by: \r\n",
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%pip show \"pypdf\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "de3f715519fda6c4",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:08:39.729429Z",
|
|
"start_time": "2024-04-04T10:08:37.438498Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Name: torch\r\n",
|
|
"Version: 2.2.1+cpu\r\n",
|
|
"Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\r\n",
|
|
"Home-page: https://pytorch.org/\r\n",
|
|
"Author: PyTorch Team\r\n",
|
|
"Author-email: packages@pytorch.org\r\n",
|
|
"License: BSD-3\r\n",
|
|
"Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages\r\n",
|
|
"Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions\r\n",
|
|
"Required-by: sentence-transformers, torchaudio, torchvision\r\n",
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%pip show torch"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ce1350d2d6e3ed63",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"## Text extraction\n",
|
|
"\n",
|
|
"* Here the html and text data is extracted into one txt file\n",
|
|
"* The PDF and DOCX data is extracted into another txt file\n",
|
|
"\n",
|
|
"This will be used for weighted data fusion later.\n",
|
|
"\n",
|
|
"* the texts are normalized: \n",
|
|
" * unicode normalization\n",
|
|
" * surrogate characters get replaced\n",
|
|
" * html gets converted to text\n",
|
|
" * pdfs get repaired\n",
|
|
" * docx files get read\n",
|
|
"\n",
|
|
"* exceptions get handled (UTF-16 issues, PDF reference errors)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "b557444b8b1d4839",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T09:25:39.388933Z",
|
|
"start_time": "2024-04-04T09:25:39.320902Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"ename": "ModuleNotFoundError",
|
|
"evalue": "No module named 'html2text'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[7], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01municodedata\u001b[39;00m \u001b[38;5;66;03m# to normalize text\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mhtml2text\u001b[39;00m \u001b[38;5;66;03m# to convert html to text\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PyPDFLoader, Docx2txtLoader\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpikepdf\u001b[39;00m \u001b[38;5;66;03m# to repair PDFs\u001b[39;00m\n",
|
|
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'html2text'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import glob\n",
|
|
"import os\n",
|
|
"\n",
|
|
"import unicodedata # to normalize text\n",
|
|
"import html2text # to convert html to text\n",
|
|
"from langchain.document_loaders import PyPDFLoader, Docx2txtLoader\n",
|
|
"import pikepdf # to repair PDFs\n",
|
|
"from pathlib import Path\n",
|
|
"from tqdm.notebook import tqdm\n",
|
|
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
|
"\n",
|
|
"def convert_html_to_text(html_blob: str) -> str:\n",
|
|
" \"\"\"\n",
|
|
" Converts a html blob into a string.\n",
|
|
" \"\"\"\n",
|
|
" h = html2text.HTML2Text()\n",
|
|
" h.mark_code = True\n",
|
|
" h.escape_snob = True\n",
|
|
" h.unicode_snob = True\n",
|
|
" # h.use_automatic_links = True \n",
|
|
" h.images_as_html = True\n",
|
|
" h.single_line_break = True\n",
|
|
" h.ignore_links = True\n",
|
|
" return h.handle(html_blob)\n",
|
|
"\n",
|
|
"def normalize_text(txt_blob: str) -> str:\n",
|
|
" \"\"\"\n",
|
|
" Normalize a text blob using NFKD normalization.\n",
|
|
" \"\"\"\n",
|
|
" return unicodedata.normalize(\"NFKD\", txt_blob)\n",
|
|
"\n",
|
|
"def repair_pdf(file_path: str) -> bool:\n",
|
|
" \"\"\"\n",
|
|
" Attempts to repair a PDF file using pikepdf.\n",
|
|
" \"\"\"\n",
|
|
" try:\n",
|
|
" with pikepdf.open(file_path, allow_overwriting_input=True) as pdf:\n",
|
|
" pdf.save(file_path)\n",
|
|
" return True\n",
|
|
" except pikepdf.PdfError as e:\n",
|
|
" print(f\"Failed to repair PDF {file_path}: {e}\")\n",
|
|
" return False\n",
|
|
"\n",
|
|
"def read_and_convert_file(file_path: str, is_html: bool, is_pdf: bool, is_docx: bool) -> str:\n",
|
|
" \"\"\"\n",
|
|
" Reads and converts a file from HTML, PDF, DOCX, or plain text to text.\n",
|
|
" :param file_path: \n",
|
|
" :param is_html: \n",
|
|
" :param is_pdf: \n",
|
|
" :param is_docx: \n",
|
|
" :return: \n",
|
|
" \"\"\"\n",
|
|
" \n",
|
|
" content = \"\"\n",
|
|
" if is_html:\n",
|
|
" try:\n",
|
|
" with open(file_path, 'r', encoding='utf-8') as file:\n",
|
|
" content = file.read()\n",
|
|
" return convert_html_to_text(content)\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Error reading {file_path}: {e}\")\n",
|
|
" return \"\"\n",
|
|
"\n",
|
|
" elif is_pdf:\n",
|
|
" try:\n",
|
|
" loader = PyPDFLoader(file_path)\n",
|
|
" # ... fixes \"Multiple definitions in dictionary at byte 0xb32 for key /ExtGState\" error\n",
|
|
" documents = loader.load()\n",
|
|
" content = \"\\n\".join(doc.page_content for doc in documents if hasattr(doc, 'page_content'))\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Error loading PDF {file_path}: {e}. Attempting to repair...\")\n",
|
|
" if repair_pdf(file_path):\n",
|
|
" try:\n",
|
|
" loader = PyPDFLoader(file_path)\n",
|
|
" documents = loader.load()\n",
|
|
" content = \"\\n\".join(doc.page_content for doc in documents if hasattr(doc, 'page_content'))\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Failed to process PDF {file_path} after repair: {e}\")\n",
|
|
" return \"\"\n",
|
|
" return normalize_text(content)\n",
|
|
"\n",
|
|
" elif is_docx:\n",
|
|
" try:\n",
|
|
" loader = Docx2txtLoader(file_path)\n",
|
|
" content = loader.load()\n",
|
|
" if isinstance(content, list):\n",
|
|
" content = \"\\n\".join(content)\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Error reading DOCX {file_path}: {e}\")\n",
|
|
" return \"\"\n",
|
|
" return normalize_text(content)\n",
|
|
"\n",
|
|
" else: # For plain text files\n",
|
|
" try:\n",
|
|
" with open(file_path, 'r', encoding='utf-8') as file:\n",
|
|
" return normalize_text(file.read())\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Error reading {file_path}: {e}\")\n",
|
|
" return \"\"\n",
|
|
"\n",
|
|
"def sanitize_text(text):\n",
|
|
" \"\"\"\n",
|
|
" Removes or replaces surrogate characters from a string.\n",
|
|
" \"\"\"\n",
|
|
" return text.encode('utf-8', 'replace').decode('utf-8')\n",
|
|
"\n",
|
|
"def append_to_output(data: str, is_pdf: bool, is_docx: bool, output_path: str):\n",
|
|
" \"\"\"\n",
|
|
" Appends sanitized data to an output file.\n",
|
|
" \"\"\"\n",
|
|
" sanitized_data = sanitize_text(data)\n",
|
|
" if is_pdf or is_docx:\n",
|
|
" output_path = str(Path(output_path).with_suffix('')) + \".documents.txt\"\n",
|
|
" \n",
|
|
" with open(output_path, \"a\", encoding='utf-8') as output_file:\n",
|
|
" output_file.write(sanitized_data)\n",
|
|
"\n",
|
|
"def process_file(file):\n",
|
|
" is_html = file.endswith('.html')\n",
|
|
" is_pdf = file.endswith('.pdf')\n",
|
|
" is_docx = file.endswith('.docx')\n",
|
|
" \n",
|
|
" file_content = read_and_convert_file(file, is_html, is_pdf, is_docx)\n",
|
|
" append_to_output(file_content, is_pdf, is_docx, output_path=output_path)\n",
|
|
"\n",
|
|
"def process_files_in_directory(directory: str):\n",
|
|
" txt_html_files = glob.glob(os.path.join(directory, \"*.txt\")) + glob.glob(os.path.join(directory, \"*.html\"))\n",
|
|
" pdf_docx_files = glob.glob(os.path.join(directory, \"img\", \"*.pdf\")) + glob.glob(os.path.join(directory, \"img\", \"*.docx\"))\n",
|
|
" all_files = txt_html_files + pdf_docx_files\n",
|
|
"\n",
|
|
" # Initialize the progress bar\n",
|
|
" pbar = tqdm(total=len(all_files), desc=\"Processing files\")\n",
|
|
"\n",
|
|
" with ThreadPoolExecutor(max_workers=3) as executor:\n",
|
|
" # Submit all files to the executor and store future objects\n",
|
|
" futures = [executor.submit(process_file, file) for file in all_files]\n",
|
|
" \n",
|
|
" # As tasks complete, update the progress bar\n",
|
|
" for future in as_completed(futures):\n",
|
|
" pbar.update(1) # Update the progress bar by one for each task completed\n",
|
|
"\n",
|
|
" # Ensure the progress bar is closed upon completion\n",
|
|
" pbar.close()\n",
|
|
"\n",
|
|
"process_files_in_directory(extracted_evernote_db)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e1bcc07f980c865f",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"## Chunking of the texts\n",
|
|
"\n",
|
|
"The texts need to get chunked (pre-processing) before the embedding process."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "de8d9f18d8342c57",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:09:23.408646Z",
|
|
"start_time": "2024-04-04T10:08:56.104045Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Now you have 723845 chunks in /home/marius/source/bookworm/export.txt\n",
|
|
"Now you have 151259 chunks in /home/marius/source/bookworm/export.documents.txt\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
|
"\n",
|
|
"def chunk_text_data(txt_file=output_path_extracted_notes):\n",
|
|
" \n",
|
|
" with open(txt_file) as f:\n",
|
|
" text_notes = f.read()\n",
|
|
" \n",
|
|
" text_splitter = RecursiveCharacterTextSplitter(\n",
|
|
" chunk_size=100,\n",
|
|
" chunk_overlap=20,\n",
|
|
" length_function=len\n",
|
|
" )\n",
|
|
" \n",
|
|
" chunks = text_splitter.create_documents([text_notes])\n",
|
|
" print(f'Now you have {len(chunks)} chunks in {txt_file}')\n",
|
|
" return chunks\n",
|
|
" \n",
|
|
"# chunk individual text file containing the data\n",
|
|
"text_chunks = chunk_text_data(txt_file=output_path_extracted_notes)\n",
|
|
"doc_chunks = chunk_text_data(txt_file=output_path_extracted_docs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "aea7ceb111fed5f3",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"### Embedding costs - why no OpenAI?\n",
|
|
"\n",
|
|
"The OpenAI API has a cost for the embeddings.\n",
|
|
"At this point there seems to be no way to pre-estimate the costs reliably. \n",
|
|
"The following calculation is probably flawed:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "afb2c8feb9ca0bb4",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def print_embedding_cost(texts):\n",
|
|
" import tiktoken\n",
|
|
" enc = tiktoken.encoding_for_model('gpt-4')\n",
|
|
" total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])\n",
|
|
" print(f'Total Tokens: {total_tokens}')\n",
|
|
" print(f'Embedding Cost in USD: { (0.03 / 1_000) * total_tokens}')\n",
|
|
" \n",
|
|
"print_embedding_cost(text_chunks)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8012516604037e2f",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"## Use Hugging Face Embeddings Sentence Transformers\n",
|
|
"\n",
|
|
"Here we:\n",
|
|
"\n",
|
|
"* use a self-hosted on-premises model for the embedding and vectorization\n",
|
|
"* configure it for the use with the CPU or GPU\n",
|
|
"\n",
|
|
"This model is from the Beijing Academy of Artificial Intelligence\n",
|
|
"* https://huggingface.co/BAAI/bge-large-en-v1.5 \n",
|
|
"* It uses: https://huggingface.co/docs/transformers/model_doc/auto \n",
|
|
"\n",
|
|
"It will produce embeddings of 1024 dimensions, roughly 500 less than OpenAI Embeddings."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "3081256c9cf22780",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:09:29.687485Z",
|
|
"start_time": "2024-04-04T10:09:23.410187Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"No CUDA available\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import torch\n",
|
|
"use_cuda = torch.cuda.is_available()\n",
|
|
"\n",
|
|
"USE_GPU=False\n",
|
|
"\n",
|
|
"if use_cuda:\n",
|
|
" print('__CUDNN VERSION:', torch.backends.cudnn.version())\n",
|
|
" print('__Number CUDA Devices:', torch.cuda.device_count())\n",
|
|
" print('__CUDA Device Name:',torch.cuda.get_device_name(0))\n",
|
|
" print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)\n",
|
|
" USE_GPU=True\n",
|
|
" print(\"GPU enabled\")\n",
|
|
" \n",
|
|
"if not use_cuda:\n",
|
|
" print('No CUDA available')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "c1ca979bbc1610bb",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:09:29.889360Z",
|
|
"start_time": "2024-04-04T10:09:29.688832Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
|
"\n",
|
|
"# pre-trained model path\n",
|
|
"modelPath = \"BAAI/bge-large-en-v1.5\"\n",
|
|
"\n",
|
|
"# Create a dictionary with model configuration options, specifying to use the CPU or GPU for computations\n",
|
|
"if not USE_GPU:\n",
|
|
" model_kwargs = {'device':'cpu'}\n",
|
|
"else:\n",
|
|
" model_kwargs = {}\n",
|
|
"\n",
|
|
"# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to True\n",
|
|
"encode_kwargs = {'normalize_embeddings': True}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "3c2b9cd67f161714",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:09:55.733575Z",
|
|
"start_time": "2024-04-04T10:09:34.059018Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Initialize an instance of HuggingFaceEmbeddings with the specified parameters\n",
|
|
"# this model requires sentence_transformers\n",
|
|
"\n",
|
|
"embeddings = HuggingFaceEmbeddings(\n",
|
|
" model_name=modelPath, # Provide the pre-trained model's path\n",
|
|
" model_kwargs=model_kwargs, # Pass the model configuration options\n",
|
|
" encode_kwargs=encode_kwargs # Pass the encoding options\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "3b9ff8cad49442cf",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:10:01.717769Z",
|
|
"start_time": "2024-04-04T10:09:58.740831Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1024 dimensions are going to be used\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"vector = embeddings.embed_query(text_chunks[0].page_content)\n",
|
|
"# print(vector)\n",
|
|
"n_dimensions = len(vector)\n",
|
|
"print(n_dimensions, \" dimensions are going to be used\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b347fb5ee68daf60",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"source": [
|
|
"## Batch process the embedding\n",
|
|
"\n",
|
|
"Many data-science tasks require to split a larger processing operation into batch jobs.\n",
|
|
"Like in the good old Mainframe days.\n",
|
|
"\n",
|
|
"The vector DB: https://github.com/asg017/sqlite-vss \n",
|
|
"Basis: https://faiss.ai/ - a library for efficient similarity search and clustering of dense vectors.\n",
|
|
"\n",
|
|
"We add vectors of 1024 dimensions per chunk (sentence, line break delimited) to the DB.\n",
|
|
"The processing is done in batches of 50 chunks, using 3 threads."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "b03bfcb6c666db1",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:10:08.134514Z",
|
|
"start_time": "2024-04-04T10:10:07.895943Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
|
"import os\n",
|
|
"\n",
|
|
"from tqdm.notebook import tqdm # Import tqdm for notebook\n",
|
|
"from typing import List\n",
|
|
"from langchain.schema.document import Document\n",
|
|
"\n",
|
|
"from langchain_community.vectorstores import FAISS"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "e6ffc345c26298ad",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2024-04-04T10:32:48.905517Z",
|
|
"start_time": "2024-04-04T10:30:48.115043Z"
|
|
},
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "bbc83a7dfc1945eb9c967ce0cc85a6b0",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"Processing batches: 0%| | 0/14477 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"KeyboardInterrupt\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def add_texts_in_batches(batch: List[Document], sqlite_table: str = \"evernote\", embeddings=embeddings) -> None:\n",
|
|
" \"\"\"\n",
|
|
" Using type hints is a good idea here, because error messages get swallowed by the ThreadPoolExecutor.\n",
|
|
"\n",
|
|
" The exception handling serves the same purpose.\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" try:\n",
|
|
" db = FAISS.from_documents(batch, embeddings)\n",
|
|
" return db\n",
|
|
"\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Exception occurred in add_texts_in_batches: {e}\")\n",
|
|
"\n",
|
|
"def divide_chunks(chunks, n):\n",
|
|
" \"\"\"\n",
|
|
" Divide and conquer\n",
|
|
" \"\"\"\n",
|
|
" for i in range(0, len(chunks), n):\n",
|
|
" yield chunks[i:i + n]\n",
|
|
"\n",
|
|
"\n",
|
|
"def vectorize_data_in_batches(chunks, embeddings):\n",
|
|
" num_workers = 3\n",
|
|
" batch_size = 50 # Adjust based on your needs and memory constraints\n",
|
|
"\n",
|
|
" batches = list(divide_chunks(chunks, batch_size))\n",
|
|
" faiss_db = None # List to collect the returned db objects\n",
|
|
"\n",
|
|
" with ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
|
|
" # Submit all the batches for processing\n",
|
|
" futures = {executor.submit(add_texts_in_batches, batch, embeddings=embeddings): batch for batch in batches}\n",
|
|
"\n",
|
|
" # Setup the tqdm progress bar\n",
|
|
" progress_bar = tqdm(total=len(futures), desc=\"Processing batches\")\n",
|
|
"\n",
|
|
" for future in as_completed(futures):\n",
|
|
" # Each time a future completes, update the progress and collect the result\n",
|
|
" progress_bar.update(1)\n",
|
|
" try:\n",
|
|
" db_result = future.result() # This is where you get the returned value from add_texts_in_batches\n",
|
|
" if faiss_db is not None:\n",
|
|
" faiss_db = db_result.merge_from(faiss_db)\n",
|
|
" else:\n",
|
|
" faiss_db = db_result\n",
|
|
" \n",
|
|
" except Exception as e:\n",
|
|
" print(f\"An error occurred: {e}\")\n",
|
|
"\n",
|
|
" progress_bar.close() # Ensure the progress bar is closed at the end\n",
|
|
"\n",
|
|
" print(\"All texts have been added to the database.\")\n",
|
|
" faiss_db.save_local(\"faiss_index\")\n",
|
|
" \n",
|
|
"vectorize_data_in_batches(chunks=text_chunks, embeddings=embeddings)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 2
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython2",
|
|
"version": "2.7.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|