Added FAISS MMR query and Ollama Mistral LLM prompt and reply, with LangChain's in-memory cache

This commit is contained in:
Marius Ciepluch 2024-04-05 12:01:32 +00:00
parent 8dd968968b
commit d16d897502

View File

@ -5,7 +5,10 @@
"id": "18d62071e34b0d53",
"metadata": {
"collapsed": false,
"id": "18d62071e34b0d53"
"id": "18d62071e34b0d53",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n"
@ -21,12 +24,11 @@
"\n",
"## Features\n",
"\n",
"* vectorize text, html files, pdfs and docx into one vector DB, split in tables (sqlite vss)\n",
"* vectorize text, html files, pdfs and docx into one vector store (FAISS)\n",
"* use local self-hosted embeddings (CPU or GPU computed)\n",
" * for sentences\n",
"* query a local sqlite vss vector db, use cache from LangChain (sqlite)\n",
"* use OpenAI API and (Ollama on-prem self-hosted) Mistral for the response processing\n",
"* compare with LLMware Bling"
"* query a local vector store, use cache from LangChain (in-memory)\n",
"* use Ollama on-prem self-hosted Mistral for the response processing / prompt engineering"
]
},
{
@ -46,7 +48,10 @@
"id": "94517a27e3148ff4",
"metadata": {
"collapsed": false,
"id": "94517a27e3148ff4"
"id": "94517a27e3148ff4",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"# Setup and configuration\n",
@ -185,7 +190,10 @@
"id": "a8c8692786d83c00",
"metadata": {
"collapsed": false,
"id": "a8c8692786d83c00"
"id": "a8c8692786d83c00",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Select key dependencies\n",
@ -235,7 +243,10 @@
"id": "297746c807e95fbf",
"metadata": {
"collapsed": false,
"id": "297746c807e95fbf"
"id": "297746c807e95fbf",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"* `pikepdf` is used to repair some PDFs"
@ -283,7 +294,10 @@
"id": "7c7a7f6b0db3719e",
"metadata": {
"collapsed": false,
"id": "7c7a7f6b0db3719e"
"id": "7c7a7f6b0db3719e",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"* `pypdf` with all features is needed because this DB consists of 100+ PDFs"
@ -421,7 +435,10 @@
"id": "ce1350d2d6e3ed63",
"metadata": {
"collapsed": false,
"id": "ce1350d2d6e3ed63"
"id": "ce1350d2d6e3ed63",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Text extraction\n",
@ -617,7 +634,10 @@
"id": "e1bcc07f980c865f",
"metadata": {
"collapsed": false,
"id": "e1bcc07f980c865f"
"id": "e1bcc07f980c865f",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"# Chunking of the texts\n",
@ -682,7 +702,10 @@
"end_time": "2024-04-05T11:28:29.590616Z",
"start_time": "2024-04-05T11:28:29.586268Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -705,7 +728,10 @@
"id": "aea7ceb111fed5f3",
"metadata": {
"collapsed": false,
"id": "aea7ceb111fed5f3"
"id": "aea7ceb111fed5f3",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"### Embedding costs - why no OpenAI?\n",
@ -752,7 +778,10 @@
"id": "8012516604037e2f",
"metadata": {
"collapsed": false,
"id": "8012516604037e2f"
"id": "8012516604037e2f",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"# Use Hugging Face Embeddings Sentence Transformers\n",
@ -1077,7 +1106,10 @@
"id": "b347fb5ee68daf60",
"metadata": {
"collapsed": false,
"id": "b347fb5ee68daf60"
"id": "b347fb5ee68daf60",
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Batch process the embedding\n",
@ -1268,7 +1300,10 @@
"end_time": "2024-04-05T11:02:12.762744Z",
"start_time": "2024-04-05T11:02:12.759789Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1284,7 +1319,10 @@
"end_time": "2024-04-05T11:11:46.382509Z",
"start_time": "2024-04-05T11:10:10.900581Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1304,9 +1342,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "43458ad9399324dd",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1339,7 +1381,10 @@
"end_time": "2024-04-05T11:33:25.027401Z",
"start_time": "2024-04-05T11:33:24.742258Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1389,7 +1434,10 @@
"end_time": "2024-04-05T11:31:58.672502Z",
"start_time": "2024-04-05T11:31:58.284632Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1415,7 +1463,10 @@
"end_time": "2024-04-05T11:40:04.650321Z",
"start_time": "2024-04-05T11:40:00.463436Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1436,7 +1487,10 @@
"cell_type": "markdown",
"id": "b8ad09a6a2b98e12",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"# Use the FAISS index with Mistral"
@ -1451,7 +1505,10 @@
"end_time": "2024-04-05T11:41:25.823973Z",
"start_time": "2024-04-05T11:41:24.910564Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1468,7 +1525,10 @@
"cell_type": "markdown",
"id": "f75b4231f798edec",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Pass MMR search results to Mistral\n",
@ -1487,7 +1547,10 @@
"end_time": "2024-04-05T11:44:35.703318Z",
"start_time": "2024-04-05T11:44:34.995829Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1505,7 +1568,10 @@
"end_time": "2024-04-05T11:49:52.931101Z",
"start_time": "2024-04-05T11:49:42.877730Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1580,14 +1646,14 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"pygments_lexer": "ipython3",
"version": "3.11.8"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {