From d16d89750213ca86e2339ef599ac565369cf147d Mon Sep 17 00:00:00 2001 From: Marius Ciepluch <11855163+norandom@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:01:32 +0000 Subject: [PATCH] added FAISS MMR query and Ollama Mistral LLM prompt and reply, with in-mem cache by LangChain --- EverNote_Documents_To_FAISS_Colab_GPU.ipynb | 124 +++++++++++++++----- 1 file changed, 95 insertions(+), 29 deletions(-) diff --git a/EverNote_Documents_To_FAISS_Colab_GPU.ipynb b/EverNote_Documents_To_FAISS_Colab_GPU.ipynb index 3102a27..8fccd7e 100644 --- a/EverNote_Documents_To_FAISS_Colab_GPU.ipynb +++ b/EverNote_Documents_To_FAISS_Colab_GPU.ipynb @@ -5,7 +5,10 @@ "id": "18d62071e34b0d53", "metadata": { "collapsed": false, - "id": "18d62071e34b0d53" + "id": "18d62071e34b0d53", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n" @@ -21,12 +24,11 @@ "\n", "## Features\n", "\n", - "* vectorize text, html files, pdfs and docx into one vector DB, split in tables (sqlite vss)\n", + "* vectorize text, html files, pdfs and docx into one vector store (FAISS)\n", "* use local self-hosted embeddings (CPU or GPU computed)\n", " * for sentences\n", - "* query a local sqlite vss vector db, use cache from LangChain (sqlite)\n", - "* use OpenAI API and (Ollama on-prem self-hosted) Mistral for the response processing\n", - "* compare with LLMware Bling" + "* query a local vector store, use cache from LangChain (in-memory)\n", + "* use Ollama on-prem self-hosted Mistral for the response processing / prompt engineering" ] }, { @@ -46,7 +48,10 @@ "id": "94517a27e3148ff4", "metadata": { "collapsed": false, - "id": "94517a27e3148ff4" + "id": "94517a27e3148ff4", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# Setup and configuration\n", @@ -185,7 +190,10 @@ "id": "a8c8692786d83c00", "metadata": { "collapsed": false, - "id": "a8c8692786d83c00" + "id": "a8c8692786d83c00", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Select key dependencies\n", @@ -235,7 +243,10 @@ "id": "297746c807e95fbf", "metadata": { "collapsed": false, - "id": "297746c807e95fbf" + "id": "297746c807e95fbf", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "* `pikepdf` is used to repair some PDFs" @@ -283,7 +294,10 @@ "id": "7c7a7f6b0db3719e", "metadata": { "collapsed": false, - "id": "7c7a7f6b0db3719e" + "id": "7c7a7f6b0db3719e", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "* `pypdf` with all features is needed because this DB consists of 100+ PDFs" @@ -421,7 +435,10 @@ "id": "ce1350d2d6e3ed63", "metadata": { "collapsed": false, - "id": "ce1350d2d6e3ed63" + "id": "ce1350d2d6e3ed63", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Text extraction\n", @@ -617,7 +634,10 @@ "id": "e1bcc07f980c865f", "metadata": { "collapsed": false, - "id": "e1bcc07f980c865f" + "id": "e1bcc07f980c865f", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# Chunking of the texts\n", @@ -682,7 +702,10 @@ "end_time": "2024-04-05T11:28:29.590616Z", "start_time": "2024-04-05T11:28:29.586268Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -705,7 +728,10 @@ "id": "aea7ceb111fed5f3", "metadata": { "collapsed": false, - "id": "aea7ceb111fed5f3" + "id": "aea7ceb111fed5f3", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "### Embedding costs - why no OpenAI?\n", @@ -752,7 +778,10 @@ "id": "8012516604037e2f", "metadata": { "collapsed": false, - "id": "8012516604037e2f" + "id": "8012516604037e2f", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# Use Hugging Face Embeddings Sentence Transformers\n", @@ -1077,7 +1106,10 @@ "id": "b347fb5ee68daf60", "metadata": { "collapsed": false, - "id": "b347fb5ee68daf60" + "id": "b347fb5ee68daf60", + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Batch process the embedding\n", @@ -1268,7 +1300,10 @@ "end_time": "2024-04-05T11:02:12.762744Z", "start_time": "2024-04-05T11:02:12.759789Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1284,7 +1319,10 @@ "end_time": "2024-04-05T11:11:46.382509Z", "start_time": "2024-04-05T11:10:10.900581Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1304,9 +1342,13 @@ }, { "cell_type": "code", + "execution_count": null, "id": "43458ad9399324dd", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1339,7 +1381,10 @@ "end_time": "2024-04-05T11:33:25.027401Z", "start_time": "2024-04-05T11:33:24.742258Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1389,7 +1434,10 @@ "end_time": "2024-04-05T11:31:58.672502Z", "start_time": "2024-04-05T11:31:58.284632Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1415,7 +1463,10 @@ "end_time": "2024-04-05T11:40:04.650321Z", "start_time": "2024-04-05T11:40:00.463436Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1436,7 +1487,10 @@ "cell_type": "markdown", "id": "b8ad09a6a2b98e12", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# Use the FAISS index with Mistral" @@ -1451,7 +1505,10 @@ "end_time": "2024-04-05T11:41:25.823973Z", "start_time": "2024-04-05T11:41:24.910564Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1468,7 +1525,10 @@ "cell_type": "markdown", "id": "f75b4231f798edec", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Pass MMR search results to Mistral\n", @@ -1487,7 +1547,10 @@ "end_time": "2024-04-05T11:44:35.703318Z", "start_time": "2024-04-05T11:44:34.995829Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1505,7 +1568,10 @@ "end_time": "2024-04-05T11:49:52.931101Z", "start_time": "2024-04-05T11:49:42.877730Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1580,14 +1646,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.8" }, "widgets": { "application/vnd.jupyter.widget-state+json": {