mirror of
https://github.com/norandom/project_bookworm.git
synced 2024-11-22 08:43:42 +00:00
1380 lines
49 KiB
Plaintext
1380 lines
49 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "18d62071e34b0d53",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "18d62071e34b0d53"
|
||
},
|
||
"source": [
|
||
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n",
|
||
"\n",
|
||
"## Features\n",
|
||
"\n",
|
||
"* vectorize text, html files, pdfs and docx into one vector DB, split in tables (sqlite vss)\n",
|
||
"* use local self-hosted embeddings (CPU or GPU computed)\n",
|
||
" * for sentences\n",
|
||
"* query a local sqlite vss vector db, use cache from LangChain (sqlite)\n",
|
||
"* use OpenAI API and (Ollama on-prem self-hosted) Mistral for the response processing\n",
|
||
"* compare with LLMware Bling\n",
|
||
"\n",
|
||
"## Anti-Features\n",
|
||
"\n",
|
||
"* due to cost reasons the OpenAI embeddings don't get used. So sorry :p"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "94517a27e3148ff4",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "94517a27e3148ff4"
|
||
},
|
||
"source": [
|
||
"# Configuration"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "fd9747a54ea8fcef",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:35:54.949214Z",
|
||
"start_time": "2024-04-04T10:35:54.945013Z"
|
||
},
|
||
"id": "fd9747a54ea8fcef"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import sys\n",
|
||
"import os\n",
|
||
"import subprocess\n",
|
||
"\n",
|
||
"# True when the notebook runs inside Google Colab (the google.colab module is already loaded there).\n",
"IN_COLAB = 'google.colab' in sys.modules\n",
"\n",
"if IN_COLAB:\n",
"    # Google Colab environment: exports and the resulting DB live under /content.\n",
"    output_path_extracted_notes = \"/content/export.txt\"\n",
"    output_path_extracted_docs = \"/content/export.documents.txt\"\n",
"    result_db = \"/content/evernote.db\"\n",
"\n",
"    # Prepare the Colab environment with the project's setup script.\n",
"    # NOTE(review): this sources a script fetched from the network at run time - confirm the source is trusted.\n",
"    subprocess.run('''\n",
"    source <(curl -s https://raw.githubusercontent.com/norandom/project_bookworm/main/scripts/prepare_colab_env.sh)\n",
"    ''',\n",
"        shell=True, check=True,\n",
"        executable='/bin/bash')\n",
"\n",
"else:\n",
"    # Local environment: the Evernote DB path containing the extracted data.\n",
"    extracted_evernote_db = \"/home/marius/data/it-sec-research-extracted/IT sec research\"\n",
"\n",
"    # Output paths for the Evernote text notes and for the documents data.\n",
"    # These files get generated by the data extraction process.\n",
"    output_path_extracted_notes = \"/home/marius/source/bookworm/export.txt\"\n",
"    output_path_extracted_docs = \"/home/marius/source/bookworm/export.documents.txt\"\n",
"\n",
"    # Resulting DB or vector store path.\n",
"    result_db = \"/home/marius/source/bookworm/evernote.db\"\n",
"\n",
"\n",
"# Set explicitly to suppress some warnings (os is already imported at the top of this cell).\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"True\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Checks"
|
||
],
|
||
"metadata": {
|
||
"id": "yuhXPdN_z2cW"
|
||
},
|
||
"id": "yuhXPdN_z2cW"
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"print(output_path_extracted_notes)"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "6SPPaVEet9EO",
|
||
"outputId": "ceb17148-270d-44d0-d23a-9690015a3cb6"
|
||
},
|
||
"id": "6SPPaVEet9EO",
|
||
"execution_count": 3,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"/content/export.txt\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"## For the progress bars in Colab"
|
||
],
|
||
"metadata": {
|
||
"id": "B02AY_Gez61T"
|
||
},
|
||
"id": "B02AY_Gez61T"
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"%reload_ext autoreload\n",
|
||
"%autoreload 2"
|
||
],
|
||
"metadata": {
|
||
"id": "XGYNhuvrvnUD"
|
||
},
|
||
"id": "XGYNhuvrvnUD",
|
||
"execution_count": 4,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a8c8692786d83c00",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "a8c8692786d83c00"
|
||
},
|
||
"source": [
|
||
"## Dependencies\n",
|
||
"\n",
|
||
"* Cryptography is used to handle some PDF functions here (signatures)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "bb34db1ea75a1edf",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:08:32.520341Z",
|
||
"start_time": "2024-04-04T10:08:30.353678Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "bb34db1ea75a1edf",
|
||
"outputId": "25aac151-5cae-44e7-887b-27008e986821"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Name: cryptography\n",
|
||
"Version: 42.0.5\n",
|
||
"Summary: cryptography is a package which provides cryptographic recipes and primitives to Python developers.\n",
|
||
"Home-page: \n",
|
||
"Author: \n",
|
||
"Author-email: The Python Cryptographic Authority and individual contributors <cryptography-dev@python.org>\n",
|
||
"License: Apache-2.0 OR BSD-3-Clause\n",
|
||
"Location: /usr/local/lib/python3.10/dist-packages\n",
|
||
"Requires: cffi\n",
|
||
"Required-by: pyOpenSSL\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%pip show cryptography"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "297746c807e95fbf",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "297746c807e95fbf"
|
||
},
|
||
"source": [
|
||
"* pikepdf is used to repair some PDFs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ebc8af0183532fc2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:08:34.665865Z",
|
||
"start_time": "2024-04-04T10:08:32.522020Z"
|
||
},
|
||
"id": "ebc8af0183532fc2",
|
||
"outputId": "2398386a-d6d5-4574-9416-c7f8f92a082c"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Name: pikepdf\r\n",
|
||
"Version: 8.13.0\r\n",
|
||
"Summary: Read and write PDFs with Python, powered by qpdf\r\n",
|
||
"Home-page: \r\n",
|
||
"Author: \r\n",
|
||
"Author-email: \"James R. Barlow\" <james@purplerock.ca>\r\n",
|
||
"License: MPL-2.0\r\n",
|
||
"Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages\r\n",
|
||
"Requires: Deprecated, lxml, packaging, Pillow\r\n",
|
||
"Required-by: \r\n",
|
||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%pip show pikepdf"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "7c7a7f6b0db3719e",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "7c7a7f6b0db3719e"
|
||
},
|
||
"source": [
|
||
"* pypdf with all features is needed because this DB consists of 100+ PDFs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "779f81e2ab00f73c",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:08:37.436449Z",
|
||
"start_time": "2024-04-04T10:08:35.269255Z"
|
||
},
|
||
"id": "779f81e2ab00f73c",
|
||
"outputId": "353f67d9-0d77-45ba-85d3-37d8a651580c"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Name: pypdf\r\n",
|
||
"Version: 4.0.2\r\n",
|
||
"Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files\r\n",
|
||
"Home-page: \r\n",
|
||
"Author: \r\n",
|
||
"Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>\r\n",
|
||
"License: \r\n",
|
||
"Location: /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages\r\n",
|
||
"Requires: \r\n",
|
||
"Required-by: \r\n",
|
||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%pip show \"pypdf\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "de3f715519fda6c4",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:08:39.729429Z",
|
||
"start_time": "2024-04-04T10:08:37.438498Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "de3f715519fda6c4",
|
||
"outputId": "858cd8ae-32d3-4373-9ac4-971e424079bb"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Name: torch\n",
|
||
"Version: 2.2.1+cu121\n",
|
||
"Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\n",
|
||
"Home-page: https://pytorch.org/\n",
|
||
"Author: PyTorch Team\n",
|
||
"Author-email: packages@pytorch.org\n",
|
||
"License: BSD-3\n",
|
||
"Location: /usr/local/lib/python3.10/dist-packages\n",
|
||
"Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions\n",
|
||
"Required-by: fastai, sentence-transformers, torchaudio, torchdata, torchtext, torchvision\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%pip show torch"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"%pip show faiss_gpu"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "HARY_QMJvttI",
|
||
"outputId": "1a3d2e43-b3d4-46f6-a526-ac0cb44bd1e6"
|
||
},
|
||
"id": "HARY_QMJvttI",
|
||
"execution_count": 8,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Name: faiss-gpu\n",
|
||
"Version: 1.7.2\n",
|
||
"Summary: A library for efficient similarity search and clustering of dense vectors.\n",
|
||
"Home-page: https://github.com/kyamagu/faiss-wheels\n",
|
||
"Author: Kota Yamaguchi\n",
|
||
"Author-email: KotaYamaguchi1984@gmail.com\n",
|
||
"License: MIT\n",
|
||
"Location: /usr/local/lib/python3.10/dist-packages\n",
|
||
"Requires: \n",
|
||
"Required-by: \n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ce1350d2d6e3ed63",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "ce1350d2d6e3ed63"
|
||
},
|
||
"source": [
|
||
"## Text extraction\n",
|
||
"\n",
|
||
"* Here the html and text data is extracted into one txt file\n",
|
||
"* The PDF and DOCX data is extracted into another txt file\n",
|
||
"\n",
|
||
"This will be used for weighted data fusion later.\n",
|
||
"\n",
|
||
"* the texts are normalized:\n",
|
||
" * unicode normalization\n",
|
||
" * surrogate characters get replaced\n",
|
||
" * html gets converted to text\n",
|
||
" * pdfs get repaired\n",
|
||
" * docx files get read\n",
|
||
"\n",
|
||
"* exceptions get handled (UTF-16 issues, PDF reference errors)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "b557444b8b1d4839",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T09:25:39.388933Z",
|
||
"start_time": "2024-04-04T09:25:39.320902Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 394
|
||
},
|
||
"id": "b557444b8b1d4839",
|
||
"outputId": "9487736e-c65b-4b1e-bc32-f743bf73a035"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "error",
|
||
"ename": "ModuleNotFoundError",
|
||
"evalue": "No module named 'html2text'",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||
"\u001b[0;32m<ipython-input-5-03e2f5ff2219>\u001b[0m in \u001b[0;36m<cell line: 5>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0municodedata\u001b[0m \u001b[0;31m# to normalize text\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mhtml2text\u001b[0m \u001b[0;31m# to convert html to text\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mlangchain\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdocument_loaders\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPyPDFLoader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDocx2txtLoader\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpikepdf\u001b[0m \u001b[0;31m# to repair PDFs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'html2text'",
|
||
"",
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
|
||
],
|
||
"errorDetails": {
|
||
"actions": [
|
||
{
|
||
"action": "open_url",
|
||
"actionText": "Open Examples",
|
||
"url": "/notebooks/snippets/importing_libraries.ipynb"
|
||
}
|
||
]
|
||
}
|
||
}
|
||
],
|
||
"source": [
|
||
"import glob\n",
|
||
"import os\n",
|
||
"\n",
|
||
"import unicodedata # to normalize text\n",
|
||
"import html2text # to convert html to text\n",
|
||
"from langchain.document_loaders import PyPDFLoader, Docx2txtLoader\n",
|
||
"import pikepdf # to repair PDFs\n",
|
||
"from pathlib import Path\n",
|
||
"from tqdm.notebook import tqdm\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
||
"def convert_html_to_text(html_blob: str) -> str:\n",
"    \"\"\"\n",
"    Render an HTML blob as plain text with html2text.\n",
"\n",
"    :param html_blob: HTML markup to convert.\n",
"    :return: the text produced by HTML2Text.handle().\n",
"    \"\"\"\n",
"    converter_options = {\n",
"        \"mark_code\": True,\n",
"        \"escape_snob\": True,\n",
"        \"unicode_snob\": True,\n",
"        # use_automatic_links is not set here\n",
"        \"images_as_html\": True,\n",
"        \"single_line_break\": True,\n",
"        \"ignore_links\": True,\n",
"    }\n",
"\n",
"    converter = html2text.HTML2Text()\n",
"    for option_name, option_value in converter_options.items():\n",
"        setattr(converter, option_name, option_value)\n",
"    return converter.handle(html_blob)\n",
|
||
"\n",
|
||
"def normalize_text(txt_blob: str) -> str:\n",
"    \"\"\"\n",
"    Apply Unicode NFKD normalization to a text blob.\n",
"\n",
"    :param txt_blob: text to normalize.\n",
"    :return: the NFKD-normalized text.\n",
"    \"\"\"\n",
"    normalization_form = \"NFKD\"\n",
"    normalized = unicodedata.normalize(normalization_form, txt_blob)\n",
"    return normalized\n",
|
||
"\n",
|
||
"def repair_pdf(file_path: str) -> bool:\n",
"    \"\"\"\n",
"    Try to repair a PDF by opening it with pikepdf and saving it back to the same path.\n",
"\n",
"    The input file is overwritten in place (allow_overwriting_input=True).\n",
"\n",
"    :param file_path: path of the PDF to repair.\n",
"    :return: True when the file was re-saved, False when pikepdf raised a PdfError.\n",
"    \"\"\"\n",
"    try:\n",
"        with pikepdf.open(file_path, allow_overwriting_input=True) as pdf:\n",
"            pdf.save(file_path)\n",
"    except pikepdf.PdfError as e:\n",
"        print(f\"Failed to repair PDF {file_path}: {e}\")\n",
"        return False\n",
"    else:\n",
"        return True\n",
|
||
"\n",
|
||
"def _load_page_text(loader) -> str:\n",
"    \"\"\"\n",
"    Load documents through a LangChain loader and join their page_content with newlines.\n",
"    \"\"\"\n",
"    documents = loader.load()\n",
"    return \"\\n\".join(doc.page_content for doc in documents if hasattr(doc, 'page_content'))\n",
"\n",
"def read_and_convert_file(file_path: str, is_html: bool, is_pdf: bool, is_docx: bool) -> str:\n",
"    \"\"\"\n",
"    Reads and converts a file from HTML, PDF, DOCX, or plain text to text.\n",
"\n",
"    HTML is converted with convert_html_to_text; PDF, DOCX and plain text are\n",
"    passed through normalize_text. Errors are printed and yield an empty string.\n",
"\n",
"    :param file_path: path of the file to read.\n",
"    :param is_html: treat the file as HTML.\n",
"    :param is_pdf: treat the file as PDF; if loading fails, one repair_pdf pass is tried.\n",
"    :param is_docx: treat the file as DOCX.\n",
"    :return: the extracted text, or an empty string when the file could not be read.\n",
"    \"\"\"\n",
"\n",
"    if is_html:\n",
"        try:\n",
"            with open(file_path, 'r', encoding='utf-8') as file:\n",
"                return convert_html_to_text(file.read())\n",
"        except Exception as e:\n",
"            print(f\"Error reading {file_path}: {e}\")\n",
"            return \"\"\n",
"\n",
"    if is_pdf:\n",
"        try:\n",
"            content = _load_page_text(PyPDFLoader(file_path))\n",
"        except Exception as e:\n",
"            # Seen for example with 'Multiple definitions in dictionary at byte 0xb32 for key /ExtGState'.\n",
"            print(f\"Error loading PDF {file_path}: {e}. Attempting to repair...\")\n",
"            if not repair_pdf(file_path):\n",
"                return \"\"\n",
"            try:\n",
"                content = _load_page_text(PyPDFLoader(file_path))\n",
"            except Exception as e:\n",
"                print(f\"Failed to process PDF {file_path} after repair: {e}\")\n",
"                return \"\"\n",
"        return normalize_text(content)\n",
"\n",
"    if is_docx:\n",
"        try:\n",
"            # Docx2txtLoader.load() returns Document objects, not strings: joining the\n",
"            # list directly raises TypeError, so the text is taken from page_content.\n",
"            content = _load_page_text(Docx2txtLoader(file_path))\n",
"        except Exception as e:\n",
"            print(f\"Error reading DOCX {file_path}: {e}\")\n",
"            return \"\"\n",
"        return normalize_text(content)\n",
"\n",
"    # Everything else is read as a plain UTF-8 text file.\n",
"    try:\n",
"        with open(file_path, 'r', encoding='utf-8') as file:\n",
"            return normalize_text(file.read())\n",
"    except Exception as e:\n",
"        print(f\"Error reading {file_path}: {e}\")\n",
"        return \"\"\n",
|
||
"\n",
|
||
"def sanitize_text(text):\n",
"    \"\"\"\n",
"    Round-trip a string through UTF-8, replacing characters that cannot be\n",
"    encoded (such as surrogate characters).\n",
"\n",
"    :param text: string to clean.\n",
"    :return: the string after encoding with errors='replace' and decoding again.\n",
"    \"\"\"\n",
"    encoded = text.encode('utf-8', 'replace')\n",
"    return encoded.decode('utf-8')\n",
|
||
"\n",
|
||
"import threading\n",
"\n",
"# process_file is run on several ThreadPoolExecutor workers that all append to the\n",
"# same export files; the lock keeps the text of different files from interleaving.\n",
"_OUTPUT_WRITE_LOCK = threading.Lock()\n",
"\n",
"def append_to_output(data: str, is_pdf: bool, is_docx: bool, output_path: str):\n",
"    \"\"\"\n",
"    Appends sanitized data to an output file.\n",
"\n",
"    :param data: text to append; it is passed through sanitize_text first.\n",
"    :param is_pdf: the text was extracted from a PDF.\n",
"    :param is_docx: the text was extracted from a DOCX.\n",
"    :param output_path: base output file. For PDF/DOCX content the last suffix is\n",
"        stripped and \".documents.txt\" is appended (export.txt -> export.documents.txt).\n",
"    \"\"\"\n",
"    sanitized_data = sanitize_text(data)\n",
"    if is_pdf or is_docx:\n",
"        output_path = str(Path(output_path).with_suffix('')) + \".documents.txt\"\n",
"\n",
"    with _OUTPUT_WRITE_LOCK:\n",
"        with open(output_path, \"a\", encoding='utf-8') as output_file:\n",
"            output_file.write(sanitized_data)\n",
|
||
"\n",
|
||
"def process_file(file):\n",
"    \"\"\"\n",
"    Extract the text of one file and append it to the matching export file.\n",
"\n",
"    The file type is derived from the extension (.html, .pdf, .docx); any other\n",
"    file is read as plain text.\n",
"\n",
"    :param file: path of the file to process.\n",
"    \"\"\"\n",
"    is_html = file.endswith('.html')\n",
"    is_pdf = file.endswith('.pdf')\n",
"    is_docx = file.endswith('.docx')\n",
"\n",
"    file_content = read_and_convert_file(file, is_html, is_pdf, is_docx)\n",
"    # The notes export from the configuration cell is the base path;\n",
"    # append_to_output switches to the \".documents.txt\" sibling for PDF/DOCX.\n",
"    append_to_output(file_content, is_pdf, is_docx, output_path=output_path_extracted_notes)\n",
|
||
"\n",
|
||
"def process_files_in_directory(directory: str):\n",
"    \"\"\"\n",
"    Extract the text of all notes and documents of an extracted Evernote DB.\n",
"\n",
"    *.txt and *.html files are taken from `directory`, *.pdf and *.docx files from\n",
"    its \"img\" sub-directory. Each file is handled by process_file on a pool of\n",
"    3 worker threads while a progress bar tracks completion.\n",
"\n",
"    :param directory: root directory of the extracted Evernote data.\n",
"    \"\"\"\n",
"    patterns = [\n",
"        os.path.join(directory, \"*.txt\"),\n",
"        os.path.join(directory, \"*.html\"),\n",
"        os.path.join(directory, \"img\", \"*.pdf\"),\n",
"        os.path.join(directory, \"img\", \"*.docx\"),\n",
"    ]\n",
"    all_files = [path for pattern in patterns for path in glob.glob(pattern)]\n",
"\n",
"    # The context manager closes the progress bar even if something goes wrong.\n",
"    with tqdm(total=len(all_files), desc=\"Processing files\") as pbar:\n",
"        with ThreadPoolExecutor(max_workers=3) as executor:\n",
"            # Remember which file belongs to which future for error reporting\n",
"            futures = {executor.submit(process_file, file): file for file in all_files}\n",
"\n",
"            for future in as_completed(futures):\n",
"                try:\n",
"                    # result() re-raises whatever the worker raised; without this\n",
"                    # call a failing file would go completely unnoticed.\n",
"                    future.result()\n",
"                except Exception as e:\n",
"                    print(f\"Error processing {futures[future]}: {e}\")\n",
"                pbar.update(1)  # one step per finished file\n",
|
||
"\n",
|
||
"# The path of the extracted Evernote DB is only configured for local runs; on Colab\n",
"# extracted_evernote_db is undefined, so the extraction is skipped there.\n",
"if not IN_COLAB:\n",
"    process_files_in_directory(extracted_evernote_db)\n",
"else:\n",
"    print(\"Skipping text extraction: no Evernote DB path is configured for Colab.\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e1bcc07f980c865f",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "e1bcc07f980c865f"
|
||
},
|
||
"source": [
|
||
"## Chunking of the texts\n",
|
||
"\n",
|
||
"The texts need to get chunked (pre-processing) before the embedding process."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "de8d9f18d8342c57",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:09:23.408646Z",
|
||
"start_time": "2024-04-04T10:08:56.104045Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "de8d9f18d8342c57",
|
||
"outputId": "6d45ef7e-67a8-4539-bd5f-253c1e1f9c52"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Now you have 723845 chunks in /content/export.txt\n",
|
||
"Now you have 151259 chunks in /content/export.documents.txt\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||
"\n",
|
||
"def chunk_text_data(txt_file=output_path_extracted_notes, chunk_size=100, chunk_overlap=20):\n",
"    \"\"\"\n",
"    Read a text export and split it into chunks with RecursiveCharacterTextSplitter.\n",
"\n",
"    :param txt_file: path of the text file to chunk (read as UTF-8).\n",
"    :param chunk_size: chunk_size handed to the splitter; lengths are measured with len.\n",
"    :param chunk_overlap: chunk_overlap handed to the splitter.\n",
"    :return: the documents created by the splitter, one per chunk.\n",
"    \"\"\"\n",
"\n",
"    # The export files are written as UTF-8, so they are read back the same way\n",
"    # instead of relying on the platform's default encoding.\n",
"    with open(txt_file, encoding='utf-8') as f:\n",
"        text_notes = f.read()\n",
"\n",
"    text_splitter = RecursiveCharacterTextSplitter(\n",
"        chunk_size=chunk_size,\n",
"        chunk_overlap=chunk_overlap,\n",
"        length_function=len\n",
"    )\n",
"\n",
"    chunks = text_splitter.create_documents([text_notes])\n",
"    print(f'Now you have {len(chunks)} chunks in {txt_file}')\n",
"    return chunks\n",
|
||
"\n",
|
||
"# chunk individual text file containing the data\n",
|
||
"text_chunks = chunk_text_data(txt_file=output_path_extracted_notes)\n",
|
||
"doc_chunks = chunk_text_data(txt_file=output_path_extracted_docs)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "aea7ceb111fed5f3",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "aea7ceb111fed5f3"
|
||
},
|
||
"source": [
|
||
"### Embedding costs - why no OpenAI?\n",
|
||
"\n",
|
||
"The OpenAI API has a cost for the embeddings.\n",
|
||
"At this point there seems to be no way to pre-estimate the costs reliably.\n",
|
||
"The following calculation is probably flawed:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "afb2c8feb9ca0bb4",
|
||
"metadata": {
|
||
"id": "afb2c8feb9ca0bb4"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def print_embedding_cost(texts, model='gpt-4', usd_per_1k_tokens=0.03):\n",
"    \"\"\"\n",
"    Print a rough token count and cost estimate for a list of documents.\n",
"\n",
"    :param texts: documents exposing a page_content string.\n",
"    :param model: model name handed to tiktoken.encoding_for_model.\n",
"    :param usd_per_1k_tokens: assumed price in USD per 1,000 tokens.\n",
"    \"\"\"\n",
"    import tiktoken\n",
"    # NOTE(review): the defaults reproduce the original estimate; confirm the tokenizer\n",
"    # and the price against the embedding model that is actually being considered.\n",
"    enc = tiktoken.encoding_for_model(model)\n",
"    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)\n",
"    print(f'Total Tokens: {total_tokens}')\n",
"    print(f'Embedding Cost in USD: {(usd_per_1k_tokens / 1_000) * total_tokens}')\n",
|
||
"\n",
|
||
"print_embedding_cost(text_chunks)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8012516604037e2f",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "8012516604037e2f"
|
||
},
|
||
"source": [
|
||
"## Use Hugging Face Embeddings Sentence Transformers\n",
|
||
"\n",
|
||
"Here we:\n",
|
||
"\n",
|
||
"* use a self-hosted on-premises model for the embedding and vectorization\n",
|
||
"* configure it for the use with the CPU or GPU\n",
|
||
"\n",
|
||
"This model is from the Beijing Academy of Artificial Intelligence\n",
|
||
"* https://huggingface.co/BAAI/bge-large-en-v1.5\n",
|
||
"* It uses: https://huggingface.co/docs/transformers/model_doc/auto\n",
|
||
"\n",
|
||
"It will produce embeddings of 1024 dimensions, roughly 500 fewer than OpenAI's 1536-dimensional text-embedding-ada-002 embeddings."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "3081256c9cf22780",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:09:29.687485Z",
|
||
"start_time": "2024-04-04T10:09:23.410187Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "3081256c9cf22780",
|
||
"outputId": "0a02f0bc-42ce-4f50-e670-fd8ca48111a9"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"__CUDNN VERSION: 8902\n",
|
||
"__Number CUDA Devices: 1\n",
|
||
"__CUDA Device Name: Tesla V100-SXM2-16GB\n",
|
||
"__CUDA Device Total Memory [GB]: 16.935682048\n",
|
||
"GPU enabled\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import torch\n",
|
||
"# USE_GPU mirrors CUDA availability; the embedding configuration reads it later.\n",
"use_cuda = torch.cuda.is_available()\n",
"USE_GPU = use_cuda\n",
"\n",
"if not use_cuda:\n",
"    print('No CUDA available')\n",
"else:\n",
"    # Report the CUDA setup, using device 0 for name and memory\n",
"    print('__CUDNN VERSION:', torch.backends.cudnn.version())\n",
"    print('__Number CUDA Devices:', torch.cuda.device_count())\n",
"    print('__CUDA Device Name:', torch.cuda.get_device_name(0))\n",
"    print('__CUDA Device Total Memory [GB]:', torch.cuda.get_device_properties(0).total_memory / 1e9)\n",
"    print(\"GPU enabled\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "c1ca979bbc1610bb",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:09:29.889360Z",
|
||
"start_time": "2024-04-04T10:09:29.688832Z"
|
||
},
|
||
"id": "c1ca979bbc1610bb"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||
"\n",
|
||
"# Name of the pre-trained embedding model\n",
"modelPath = \"BAAI/bge-large-en-v1.5\"\n",
"\n",
"# Model configuration options: request the CPU explicitly when no GPU is used,\n",
"# otherwise pass no device option at all.\n",
"model_kwargs = {} if USE_GPU else {'device': 'cpu'}\n",
"\n",
"# Encoding options: 'normalize_embeddings' is switched on\n",
"encode_kwargs = dict(normalize_embeddings=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "3c2b9cd67f161714",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:09:55.733575Z",
|
||
"start_time": "2024-04-04T10:09:34.059018Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "3c2b9cd67f161714",
|
||
"outputId": "6fe941fb-04c0-4f41-829d-ad594d543c02"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stderr",
|
||
"text": [
|
||
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
|
||
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
|
||
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
|
||
"You will be able to reuse this secret in all of your notebooks.\n",
|
||
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Initialize an instance of HuggingFaceEmbeddings with the specified parameters\n",
|
||
"# this model requires sentence_transformers\n",
|
||
"\n",
|
||
"embeddings = HuggingFaceEmbeddings(\n",
|
||
" model_name=modelPath, # Provide the pre-trained model's path\n",
|
||
" model_kwargs=model_kwargs, # Pass the model configuration options\n",
|
||
" encode_kwargs=encode_kwargs # Pass the encoding options\n",
|
||
" )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "3b9ff8cad49442cf",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:10:01.717769Z",
|
||
"start_time": "2024-04-04T10:09:58.740831Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "3b9ff8cad49442cf",
|
||
"outputId": "0d53532d-a992-431f-82f2-f7b921d3479e"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"1024 dimensions are going to be used\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"vector = embeddings.embed_query(text_chunks[0].page_content)\n",
|
||
"# print(vector)\n",
|
||
"n_dimensions = len(vector)\n",
|
||
"print(n_dimensions, \" dimensions are going to be used\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "b347fb5ee68daf60",
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"id": "b347fb5ee68daf60"
|
||
},
|
||
"source": [
|
||
"## Batch process the embedding\n",
|
||
"\n",
|
||
"Many data-science tasks require splitting a larger processing operation into batch jobs.\n",
|
||
"Like in the good old Mainframe days.\n",
|
||
"\n",
|
||
"The vector DB: https://github.com/asg017/sqlite-vss\n",
|
||
"Basis: https://faiss.ai/ - a library for efficient similarity search and clustering of dense vectors.\n",
|
||
"\n",
|
||
"We add vectors of 1024 dimensions per chunk (sentence, line break delimited) to the DB.\n",
|
||
"The processing is done in batches of 500 chunks, using 3 threads."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "b03bfcb6c666db1",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:10:08.134514Z",
|
||
"start_time": "2024-04-04T10:10:07.895943Z"
|
||
},
|
||
"id": "b03bfcb6c666db1"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"import os\n",
|
||
"\n",
|
||
"from tqdm.notebook import tqdm # Import tqdm for notebook\n",
|
||
"from typing import List\n",
|
||
"from langchain.schema.document import Document\n",
|
||
"\n",
|
||
"from langchain_community.vectorstores import FAISS"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e6ffc345c26298ad",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-04-04T10:32:48.905517Z",
|
||
"start_time": "2024-04-04T10:30:48.115043Z"
|
||
},
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 49,
|
||
"referenced_widgets": [
|
||
"7639624db8b44b7194b33e1587015e7b",
|
||
"d1f1b83ff52b4f339aaf7e3472e88f5f",
|
||
"a4c9656af7644c8794eea9cd95cdbb38",
|
||
"eece25d48bd94132bd6e9c25001dd0a3",
|
||
"77829cd4ef2341c58bd37ce7fb173fbf",
|
||
"78e5715e33af4af9a72f348a3cff7a45",
|
||
"89b0f4fbd3c542c6abb5ea2ba0b937fc",
|
||
"6013ba1807144ee2b0b4c83d42cf1977",
|
||
"20db3230c722479db16949f232e23fc8",
|
||
"54e9a4e180d74916b620c46cf4da6546",
|
||
"b1a5928250a94055a95a026804807cf0"
|
||
]
|
||
},
|
||
"id": "e6ffc345c26298ad",
|
||
"outputId": "c63ea480-2c4f-447e-e948-f51ec0ea6224"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "display_data",
|
||
"data": {
|
||
"text/plain": [
|
||
"Processing batches: 0%| | 0/1448 [00:00<?, ?it/s]"
|
||
],
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"version_major": 2,
|
||
"version_minor": 0,
|
||
"model_id": "7639624db8b44b7194b33e1587015e7b"
|
||
}
|
||
},
|
||
"metadata": {}
|
||
}
|
||
],
|
||
"source": [
|
||
"def add_texts_in_batches(batch: List[Document], sqlite_table: str = \"evernote\", embeddings=embeddings) -> \"FAISS | None\":\n",
"    \"\"\"\n",
"    Embed one batch of documents into its own FAISS store.\n",
"\n",
"    Type hints and the exception handling are used here because error messages\n",
"    get swallowed by the ThreadPoolExecutor this function runs on.\n",
"\n",
"    :param batch: documents to embed.\n",
"    :param sqlite_table: not used by this function; kept so existing calls keep working.\n",
"    :param embeddings: embedding model handed to FAISS.from_documents.\n",
"    :return: the FAISS store built from the batch, or None when building it failed.\n",
"    \"\"\"\n",
"\n",
"    try:\n",
"        return FAISS.from_documents(batch, embeddings)\n",
"\n",
"    except Exception as e:\n",
"        print(f\"Exception occurred in add_texts_in_batches: {e}\")\n",
"        # Make the failure value explicit so callers can test for it\n",
"        return None\n",
|
||
"\n",
|
||
"def divide_chunks(chunks, n):\n",
"    \"\"\"\n",
"    Lazily yield consecutive slices of `chunks`, each holding at most `n` items.\n",
"\n",
"    :param chunks: sliceable sequence to split.\n",
"    :param n: slice length (step between slice starts).\n",
"    \"\"\"\n",
"    yield from (chunks[start:start + n] for start in range(0, len(chunks), n))\n",
|
||
"\n",
|
||
"\n",
|
||
"def vectorize_data_in_batches(chunks, embeddings, num_workers=3, batch_size=500, index_path=\"faiss_index\"):\n",
"    \"\"\"\n",
"    Embed chunks batch-wise on a thread pool, merge the per-batch FAISS stores\n",
"    and save the merged index.\n",
"\n",
"    :param chunks: documents to embed.\n",
"    :param embeddings: embedding model passed on to add_texts_in_batches.\n",
"    :param num_workers: number of worker threads.\n",
"    :param batch_size: chunks per batch; adjust based on your needs and memory constraints.\n",
"    :param index_path: folder the merged index is written to with save_local.\n",
"    \"\"\"\n",
"\n",
"    batches = list(divide_chunks(chunks, batch_size))\n",
"    faiss_db = None\n",
"    failed_batches = 0\n",
"\n",
"    with ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
"        # Submit all the batches for processing\n",
"        futures = [executor.submit(add_texts_in_batches, batch, embeddings=embeddings) for batch in batches]\n",
"\n",
"        # The context manager closes the progress bar even if an error escapes\n",
"        with tqdm(total=len(futures), desc=\"Processing batches\") as progress_bar:\n",
"            for future in as_completed(futures):\n",
"                progress_bar.update(1)\n",
"                try:\n",
"                    db_result = future.result()\n",
"                    if db_result is None:\n",
"                        # add_texts_in_batches returns None after printing its own error\n",
"                        failed_batches += 1\n",
"                    elif faiss_db is None:\n",
"                        faiss_db = db_result\n",
"                    else:\n",
"                        faiss_db.merge_from(db_result)\n",
"\n",
"                except Exception as e:\n",
"                    failed_batches += 1\n",
"                    print(f\"An error occurred: {e}\")\n",
"\n",
"    if faiss_db is None:\n",
"        # No chunks at all, or every batch failed: there is no index to save\n",
"        print(\"No embeddings were created; nothing has been saved.\")\n",
"        return\n",
"\n",
"    faiss_db.save_local(index_path)\n",
"    if failed_batches:\n",
"        print(f\"{failed_batches} of {len(batches)} batches failed and are missing from the database.\")\n",
"    else:\n",
"        print(\"All texts have been added to the database.\")\n",
|
||
"\n",
|
||
"\n",
|
||
"vectorize_data_in_batches(chunks=text_chunks, embeddings=embeddings)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"from langchain_community.vectorstores import FAISS\n",
|
||
"\n",
|
||
"texts = [\"FAISS is an important library\", \"LangChain supports FAISS\"]\n",
|
||
"faiss = FAISS.from_texts(texts, embeddings)\n",
|
||
"print(type(faiss))"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "v6bhYHU5_9oo",
|
||
"outputId": "01bbcac4-01dc-4efe-e9e9-31c8d2b6ca56"
|
||
},
|
||
"id": "v6bhYHU5_9oo",
|
||
"execution_count": 15,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"<class 'langchain_community.vectorstores.faiss.FAISS'>\n"
|
||
]
|
||
}
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 2
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython2",
|
||
"version": "2.7.6"
|
||
},
|
||
"colab": {
|
||
"provenance": [],
|
||
"gpuType": "V100"
|
||
},
|
||
"accelerator": "GPU",
|
||
"widgets": {
|
||
"application/vnd.jupyter.widget-state+json": {
|
||
"7639624db8b44b7194b33e1587015e7b": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "HBoxModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "HBoxModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "HBoxView",
|
||
"box_style": "",
|
||
"children": [
|
||
"IPY_MODEL_d1f1b83ff52b4f339aaf7e3472e88f5f",
|
||
"IPY_MODEL_a4c9656af7644c8794eea9cd95cdbb38",
|
||
"IPY_MODEL_eece25d48bd94132bd6e9c25001dd0a3"
|
||
],
|
||
"layout": "IPY_MODEL_77829cd4ef2341c58bd37ce7fb173fbf"
|
||
}
|
||
},
|
||
"d1f1b83ff52b4f339aaf7e3472e88f5f": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "HTMLModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "HTMLModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "HTMLView",
|
||
"description": "",
|
||
"description_tooltip": null,
|
||
"layout": "IPY_MODEL_78e5715e33af4af9a72f348a3cff7a45",
|
||
"placeholder": "",
|
||
"style": "IPY_MODEL_89b0f4fbd3c542c6abb5ea2ba0b937fc",
|
||
"value": "Processing batches: 66%"
|
||
}
|
||
},
|
||
"a4c9656af7644c8794eea9cd95cdbb38": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "FloatProgressModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "FloatProgressModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "ProgressView",
|
||
"bar_style": "",
|
||
"description": "",
|
||
"description_tooltip": null,
|
||
"layout": "IPY_MODEL_6013ba1807144ee2b0b4c83d42cf1977",
|
||
"max": 1448,
|
||
"min": 0,
|
||
"orientation": "horizontal",
|
||
"style": "IPY_MODEL_20db3230c722479db16949f232e23fc8",
|
||
"value": 957
|
||
}
|
||
},
|
||
"eece25d48bd94132bd6e9c25001dd0a3": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "HTMLModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "HTMLModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "HTMLView",
|
||
"description": "",
|
||
"description_tooltip": null,
|
||
"layout": "IPY_MODEL_54e9a4e180d74916b620c46cf4da6546",
|
||
"placeholder": "",
|
||
"style": "IPY_MODEL_b1a5928250a94055a95a026804807cf0",
|
||
"value": " 957/1448 [3:10:16<1:30:31, 11.06s/it]"
|
||
}
|
||
},
|
||
"77829cd4ef2341c58bd37ce7fb173fbf": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_name": "LayoutModel",
|
||
"model_module_version": "1.2.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"78e5715e33af4af9a72f348a3cff7a45": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_name": "LayoutModel",
|
||
"model_module_version": "1.2.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"89b0f4fbd3c542c6abb5ea2ba0b937fc": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "DescriptionStyleModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "DescriptionStyleModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "StyleView",
|
||
"description_width": ""
|
||
}
|
||
},
|
||
"6013ba1807144ee2b0b4c83d42cf1977": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_name": "LayoutModel",
|
||
"model_module_version": "1.2.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"20db3230c722479db16949f232e23fc8": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "ProgressStyleModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "ProgressStyleModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "StyleView",
|
||
"bar_color": null,
|
||
"description_width": ""
|
||
}
|
||
},
|
||
"54e9a4e180d74916b620c46cf4da6546": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_name": "LayoutModel",
|
||
"model_module_version": "1.2.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"b1a5928250a94055a95a026804807cf0": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_name": "DescriptionStyleModel",
|
||
"model_module_version": "1.5.0",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "DescriptionStyleModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "StyleView",
|
||
"description_width": ""
|
||
}
|
||
}
|
||
}
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
} |