mirror of
https://github.com/norandom/project_bookworm.git
synced 2024-11-24 17:13:42 +00:00
added FAISS MMR query and Ollama Mistral LLM prompt and reply, with in-mem cache by LangChain
This commit is contained in:
parent
8dd968968b
commit
d16d897502
@ -5,7 +5,10 @@
|
|||||||
"id": "18d62071e34b0d53",
|
"id": "18d62071e34b0d53",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "18d62071e34b0d53"
|
"id": "18d62071e34b0d53",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n"
|
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n"
|
||||||
@ -21,12 +24,11 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"## Features\n",
|
"## Features\n",
|
||||||
"\n",
|
"\n",
|
||||||
"* vectorize text, html files, pdfs and docx into one vector DB, split in tables (sqlite vss)\n",
|
"* vectorize text, html files, pdfs and docx into one vector store (FAISS)\n",
|
||||||
"* use local self-hosted embeddings (CPU or GPU computed)\n",
|
"* use local self-hosted embeddings (CPU or GPU computed)\n",
|
||||||
" * for sentences\n",
|
" * for sentences\n",
|
||||||
"* query a local sqlite vss vector db, use cache from LangChain (sqlite)\n",
|
"* query a local vector store, use cache from LangChain (in-memory)\n",
|
||||||
"* use OpenAI API and (Ollama on-prem self-hosted) Mistral for the response processing\n",
|
"* use Ollama on-prem self-hosted Mistral for the response processing / prompt engineering"
|
||||||
"* compare with LLMware Bling"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -46,7 +48,10 @@
|
|||||||
"id": "94517a27e3148ff4",
|
"id": "94517a27e3148ff4",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "94517a27e3148ff4"
|
"id": "94517a27e3148ff4",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Setup and configuration\n",
|
"# Setup and configuration\n",
|
||||||
@ -185,7 +190,10 @@
|
|||||||
"id": "a8c8692786d83c00",
|
"id": "a8c8692786d83c00",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "a8c8692786d83c00"
|
"id": "a8c8692786d83c00",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"## Select key dependencies\n",
|
"## Select key dependencies\n",
|
||||||
@ -235,7 +243,10 @@
|
|||||||
"id": "297746c807e95fbf",
|
"id": "297746c807e95fbf",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "297746c807e95fbf"
|
"id": "297746c807e95fbf",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"* `pikepdf` is used to repair some PDFs"
|
"* `pikepdf` is used to repair some PDFs"
|
||||||
@ -283,7 +294,10 @@
|
|||||||
"id": "7c7a7f6b0db3719e",
|
"id": "7c7a7f6b0db3719e",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "7c7a7f6b0db3719e"
|
"id": "7c7a7f6b0db3719e",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"* `pypdf` with all features is needed because this DB consists of 100+ PDFs"
|
"* `pypdf` with all features is needed because this DB consists of 100+ PDFs"
|
||||||
@ -421,7 +435,10 @@
|
|||||||
"id": "ce1350d2d6e3ed63",
|
"id": "ce1350d2d6e3ed63",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "ce1350d2d6e3ed63"
|
"id": "ce1350d2d6e3ed63",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"## Text extraction\n",
|
"## Text extraction\n",
|
||||||
@ -617,7 +634,10 @@
|
|||||||
"id": "e1bcc07f980c865f",
|
"id": "e1bcc07f980c865f",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "e1bcc07f980c865f"
|
"id": "e1bcc07f980c865f",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Chunking of the texts\n",
|
"# Chunking of the texts\n",
|
||||||
@ -682,7 +702,10 @@
|
|||||||
"end_time": "2024-04-05T11:28:29.590616Z",
|
"end_time": "2024-04-05T11:28:29.590616Z",
|
||||||
"start_time": "2024-04-05T11:28:29.586268Z"
|
"start_time": "2024-04-05T11:28:29.586268Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -705,7 +728,10 @@
|
|||||||
"id": "aea7ceb111fed5f3",
|
"id": "aea7ceb111fed5f3",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "aea7ceb111fed5f3"
|
"id": "aea7ceb111fed5f3",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"### Embedding costs - why no OpenAI?\n",
|
"### Embedding costs - why no OpenAI?\n",
|
||||||
@ -752,7 +778,10 @@
|
|||||||
"id": "8012516604037e2f",
|
"id": "8012516604037e2f",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "8012516604037e2f"
|
"id": "8012516604037e2f",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Use Hugging Face Embeddings Sentence Transformers\n",
|
"# Use Hugging Face Embeddings Sentence Transformers\n",
|
||||||
@ -1077,7 +1106,10 @@
|
|||||||
"id": "b347fb5ee68daf60",
|
"id": "b347fb5ee68daf60",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"id": "b347fb5ee68daf60"
|
"id": "b347fb5ee68daf60",
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"## Batch process the embedding\n",
|
"## Batch process the embedding\n",
|
||||||
@ -1268,7 +1300,10 @@
|
|||||||
"end_time": "2024-04-05T11:02:12.762744Z",
|
"end_time": "2024-04-05T11:02:12.762744Z",
|
||||||
"start_time": "2024-04-05T11:02:12.759789Z"
|
"start_time": "2024-04-05T11:02:12.759789Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -1284,7 +1319,10 @@
|
|||||||
"end_time": "2024-04-05T11:11:46.382509Z",
|
"end_time": "2024-04-05T11:11:46.382509Z",
|
||||||
"start_time": "2024-04-05T11:10:10.900581Z"
|
"start_time": "2024-04-05T11:10:10.900581Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1304,9 +1342,13 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"id": "43458ad9399324dd",
|
"id": "43458ad9399324dd",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -1339,7 +1381,10 @@
|
|||||||
"end_time": "2024-04-05T11:33:25.027401Z",
|
"end_time": "2024-04-05T11:33:25.027401Z",
|
||||||
"start_time": "2024-04-05T11:33:24.742258Z"
|
"start_time": "2024-04-05T11:33:24.742258Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1389,7 +1434,10 @@
|
|||||||
"end_time": "2024-04-05T11:31:58.672502Z",
|
"end_time": "2024-04-05T11:31:58.672502Z",
|
||||||
"start_time": "2024-04-05T11:31:58.284632Z"
|
"start_time": "2024-04-05T11:31:58.284632Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1415,7 +1463,10 @@
|
|||||||
"end_time": "2024-04-05T11:40:04.650321Z",
|
"end_time": "2024-04-05T11:40:04.650321Z",
|
||||||
"start_time": "2024-04-05T11:40:00.463436Z"
|
"start_time": "2024-04-05T11:40:00.463436Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1436,7 +1487,10 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "b8ad09a6a2b98e12",
|
"id": "b8ad09a6a2b98e12",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Use the FAISS index with Mistral"
|
"# Use the FAISS index with Mistral"
|
||||||
@ -1451,7 +1505,10 @@
|
|||||||
"end_time": "2024-04-05T11:41:25.823973Z",
|
"end_time": "2024-04-05T11:41:25.823973Z",
|
||||||
"start_time": "2024-04-05T11:41:24.910564Z"
|
"start_time": "2024-04-05T11:41:24.910564Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -1468,7 +1525,10 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "f75b4231f798edec",
|
"id": "f75b4231f798edec",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"## Pass MMR search results to Mistral\n",
|
"## Pass MMR search results to Mistral\n",
|
||||||
@ -1487,7 +1547,10 @@
|
|||||||
"end_time": "2024-04-05T11:44:35.703318Z",
|
"end_time": "2024-04-05T11:44:35.703318Z",
|
||||||
"start_time": "2024-04-05T11:44:34.995829Z"
|
"start_time": "2024-04-05T11:44:34.995829Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -1505,7 +1568,10 @@
|
|||||||
"end_time": "2024-04-05T11:49:52.931101Z",
|
"end_time": "2024-04-05T11:49:52.931101Z",
|
||||||
"start_time": "2024-04-05T11:49:42.877730Z"
|
"start_time": "2024-04-05T11:49:42.877730Z"
|
||||||
},
|
},
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1580,14 +1646,14 @@
|
|||||||
"language_info": {
|
"language_info": {
|
||||||
"codemirror_mode": {
|
"codemirror_mode": {
|
||||||
"name": "ipython",
|
"name": "ipython",
|
||||||
"version": 2
|
"version": 3
|
||||||
},
|
},
|
||||||
"file_extension": ".py",
|
"file_extension": ".py",
|
||||||
"mimetype": "text/x-python",
|
"mimetype": "text/x-python",
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython2",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "2.7.6"
|
"version": "3.11.8"
|
||||||
},
|
},
|
||||||
"widgets": {
|
"widgets": {
|
||||||
"application/vnd.jupyter.widget-state+json": {
|
"application/vnd.jupyter.widget-state+json": {
|
||||||
|
Loading…
Reference in New Issue
Block a user