mirror of
https://github.com/norandom/project_bookworm.git
synced 2024-11-24 01:03:42 +00:00
added FAISS MMR query and Ollama Mistral LLM prompt and reply, with in-mem cache by LangChain
This commit is contained in:
parent
8dd968968b
commit
d16d897502
@ -5,7 +5,10 @@
|
||||
"id": "18d62071e34b0d53",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "18d62071e34b0d53"
|
||||
"id": "18d62071e34b0d53",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n"
|
||||
@ -21,12 +24,11 @@
|
||||
"\n",
|
||||
"## Features\n",
|
||||
"\n",
|
||||
"* vectorize text, html files, pdfs and docx into one vector DB, split in tables (sqlite vss)\n",
|
||||
"* vectorize text, html files, pdfs and docx into one vector store (FAISS)\n",
|
||||
"* use local self-hosted embeddings (CPU or GPU computed)\n",
|
||||
" * for sentences\n",
|
||||
"* query a local sqlite vss vector db, use cache from LangChain (sqlite)\n",
|
||||
"* use OpenAI API and (Ollama on-prem self-hosted) Mistral for the response processing\n",
|
||||
"* compare with LLMware Bling"
|
||||
"* query a local vector store, use cache from LangChain (in-memory)\n",
|
||||
"* use Ollama on-prem self-hosted Mistral for the response processing / prompt engineering"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -46,7 +48,10 @@
|
||||
"id": "94517a27e3148ff4",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "94517a27e3148ff4"
|
||||
"id": "94517a27e3148ff4",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Setup and configuration\n",
|
||||
@ -185,7 +190,10 @@
|
||||
"id": "a8c8692786d83c00",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "a8c8692786d83c00"
|
||||
"id": "a8c8692786d83c00",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Select key dependencies\n",
|
||||
@ -235,7 +243,10 @@
|
||||
"id": "297746c807e95fbf",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "297746c807e95fbf"
|
||||
"id": "297746c807e95fbf",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"* `pikepdf` is used to repair some PDFs"
|
||||
@ -283,7 +294,10 @@
|
||||
"id": "7c7a7f6b0db3719e",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "7c7a7f6b0db3719e"
|
||||
"id": "7c7a7f6b0db3719e",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"* `pypdf` with all features is needed because this DB consists of 100+ PDFs"
|
||||
@ -421,7 +435,10 @@
|
||||
"id": "ce1350d2d6e3ed63",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "ce1350d2d6e3ed63"
|
||||
"id": "ce1350d2d6e3ed63",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Text extraction\n",
|
||||
@ -617,7 +634,10 @@
|
||||
"id": "e1bcc07f980c865f",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "e1bcc07f980c865f"
|
||||
"id": "e1bcc07f980c865f",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Chunking of the texts\n",
|
||||
@ -682,7 +702,10 @@
|
||||
"end_time": "2024-04-05T11:28:29.590616Z",
|
||||
"start_time": "2024-04-05T11:28:29.586268Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -705,7 +728,10 @@
|
||||
"id": "aea7ceb111fed5f3",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "aea7ceb111fed5f3"
|
||||
"id": "aea7ceb111fed5f3",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Embedding costs - why no OpenAI?\n",
|
||||
@ -752,7 +778,10 @@
|
||||
"id": "8012516604037e2f",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "8012516604037e2f"
|
||||
"id": "8012516604037e2f",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Use Hugging Face Embeddings Sentence Transformers\n",
|
||||
@ -1077,7 +1106,10 @@
|
||||
"id": "b347fb5ee68daf60",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"id": "b347fb5ee68daf60"
|
||||
"id": "b347fb5ee68daf60",
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Batch process the embedding\n",
|
||||
@ -1268,7 +1300,10 @@
|
||||
"end_time": "2024-04-05T11:02:12.762744Z",
|
||||
"start_time": "2024-04-05T11:02:12.759789Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1284,7 +1319,10 @@
|
||||
"end_time": "2024-04-05T11:11:46.382509Z",
|
||||
"start_time": "2024-04-05T11:10:10.900581Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1304,9 +1342,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "43458ad9399324dd",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1339,7 +1381,10 @@
|
||||
"end_time": "2024-04-05T11:33:25.027401Z",
|
||||
"start_time": "2024-04-05T11:33:24.742258Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1389,7 +1434,10 @@
|
||||
"end_time": "2024-04-05T11:31:58.672502Z",
|
||||
"start_time": "2024-04-05T11:31:58.284632Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1415,7 +1463,10 @@
|
||||
"end_time": "2024-04-05T11:40:04.650321Z",
|
||||
"start_time": "2024-04-05T11:40:00.463436Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1436,7 +1487,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "b8ad09a6a2b98e12",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Use the FAISS index with Mistral"
|
||||
@ -1451,7 +1505,10 @@
|
||||
"end_time": "2024-04-05T11:41:25.823973Z",
|
||||
"start_time": "2024-04-05T11:41:24.910564Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1468,7 +1525,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "f75b4231f798edec",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Pass MMR search results to Mistral\n",
|
||||
@ -1487,7 +1547,10 @@
|
||||
"end_time": "2024-04-05T11:44:35.703318Z",
|
||||
"start_time": "2024-04-05T11:44:34.995829Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1505,7 +1568,10 @@
|
||||
"end_time": "2024-04-05T11:49:52.931101Z",
|
||||
"start_time": "2024-04-05T11:49:42.877730Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1580,14 +1646,14 @@
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.8"
|
||||
},
|
||||
"widgets": {
|
||||
"application/vnd.jupyter.widget-state+json": {
|
||||
|
Loading…
Reference in New Issue
Block a user