First status of bookworm

main
Marius Ciepluch 2024-03-21 16:24:46 +00:00
parent 5374e44c44
commit d3e0ca2f1f
1 changed files with 93 additions and 26 deletions

View File

@ -4,7 +4,10 @@
"cell_type": "markdown",
"id": "18d62071e34b0d53",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n",
@ -23,7 +26,10 @@
"cell_type": "markdown",
"id": "a8c8692786d83c00",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Dependencies\n",
@ -40,7 +46,10 @@
"end_time": "2024-03-21T15:10:31.827945Z",
"start_time": "2024-03-21T15:10:29.646399Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -69,7 +78,10 @@
"cell_type": "markdown",
"id": "297746c807e95fbf",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"* pikepdf is used to repair some PDFs"
@ -84,7 +96,10 @@
"end_time": "2024-03-21T15:12:47.900384Z",
"start_time": "2024-03-21T15:12:45.782477Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -113,7 +128,10 @@
"cell_type": "markdown",
"id": "7c7a7f6b0db3719e",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"* pypdf with all features is needed because this DB consists of 100+ PDFs "
@ -128,7 +146,10 @@
"end_time": "2024-03-21T15:17:00.760871Z",
"start_time": "2024-03-21T15:16:58.635484Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -157,7 +178,10 @@
"cell_type": "markdown",
"id": "ce1350d2d6e3ed63",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Text extraction\n",
@ -179,7 +203,10 @@
"end_time": "2024-03-17T15:34:05.847778Z",
"start_time": "2024-03-17T15:25:49.787814Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1038,7 +1065,10 @@
"cell_type": "markdown",
"id": "e1bcc07f980c865f",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Chunking of the texts\n",
@ -1055,7 +1085,10 @@
"end_time": "2024-03-17T16:13:14.479469Z",
"start_time": "2024-03-17T16:13:14.476765Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1071,7 +1104,10 @@
"end_time": "2024-03-21T15:17:53.867414Z",
"start_time": "2024-03-21T15:17:32.731232Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1107,7 +1143,10 @@
"cell_type": "markdown",
"id": "aea7ceb111fed5f3",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"### Embedding costs - why no OpenAI?\n",
@ -1126,7 +1165,10 @@
"end_time": "2024-03-21T15:18:51.003585Z",
"start_time": "2024-03-21T15:18:31.411234Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1153,7 +1195,10 @@
"cell_type": "markdown",
"id": "8012516604037e2f",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Use Hugging Face Embeddings Sentence Transformers\n",
@ -1177,7 +1222,10 @@
"end_time": "2024-03-21T15:19:15.167038Z",
"start_time": "2024-03-21T15:19:15.031139Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1201,7 +1249,10 @@
"end_time": "2024-03-21T15:42:28.163005Z",
"start_time": "2024-03-21T15:42:26.222594Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1224,7 +1275,10 @@
"end_time": "2024-03-21T16:18:45.930652Z",
"start_time": "2024-03-21T16:18:42.989032Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1246,7 +1300,10 @@
"cell_type": "markdown",
"id": "b347fb5ee68daf60",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Batch process the embedding\n",
@ -1270,7 +1327,10 @@
"end_time": "2024-03-21T16:04:44.572979Z",
"start_time": "2024-03-21T16:04:43.521107Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1293,7 +1353,10 @@
"end_time": "2024-03-21T16:10:22.121211Z",
"start_time": "2024-03-21T16:08:20.585372Z"
},
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
@ -1397,9 +1460,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f01a969c4aedac8",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -1410,21 +1477,21 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,