mirror of
https://github.com/norandom/project_bookworm.git
synced 2025-01-13 01:53:43 +00:00
First status of bookworm
This commit is contained in:
parent
5374e44c44
commit
d3e0ca2f1f
@ -4,7 +4,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "18d62071e34b0d53",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n",
|
||||
@ -23,7 +26,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "a8c8692786d83c00",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Dependencies\n",
|
||||
@ -40,7 +46,10 @@
|
||||
"end_time": "2024-03-21T15:10:31.827945Z",
|
||||
"start_time": "2024-03-21T15:10:29.646399Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -69,7 +78,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "297746c807e95fbf",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"* pikepdf is used to repair some PDFs"
|
||||
@ -84,7 +96,10 @@
|
||||
"end_time": "2024-03-21T15:12:47.900384Z",
|
||||
"start_time": "2024-03-21T15:12:45.782477Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -113,7 +128,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "7c7a7f6b0db3719e",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"* pypdf with all features is needed because this DB consists of 100+ PDFs "
|
||||
@ -128,7 +146,10 @@
|
||||
"end_time": "2024-03-21T15:17:00.760871Z",
|
||||
"start_time": "2024-03-21T15:16:58.635484Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -157,7 +178,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "ce1350d2d6e3ed63",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Text extraction\n",
|
||||
@ -179,7 +203,10 @@
|
||||
"end_time": "2024-03-17T15:34:05.847778Z",
|
||||
"start_time": "2024-03-17T15:25:49.787814Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1038,7 +1065,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "e1bcc07f980c865f",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Chunking of the texts\n",
|
||||
@ -1055,7 +1085,10 @@
|
||||
"end_time": "2024-03-17T16:13:14.479469Z",
|
||||
"start_time": "2024-03-17T16:13:14.476765Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1071,7 +1104,10 @@
|
||||
"end_time": "2024-03-21T15:17:53.867414Z",
|
||||
"start_time": "2024-03-21T15:17:32.731232Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1107,7 +1143,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "aea7ceb111fed5f3",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Embedding costs - why no OpenAI?\n",
|
||||
@ -1126,7 +1165,10 @@
|
||||
"end_time": "2024-03-21T15:18:51.003585Z",
|
||||
"start_time": "2024-03-21T15:18:31.411234Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1153,7 +1195,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "8012516604037e2f",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Use Hugging Face Embeddings Sentence Transformers\n",
|
||||
@ -1177,7 +1222,10 @@
|
||||
"end_time": "2024-03-21T15:19:15.167038Z",
|
||||
"start_time": "2024-03-21T15:19:15.031139Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1201,7 +1249,10 @@
|
||||
"end_time": "2024-03-21T15:42:28.163005Z",
|
||||
"start_time": "2024-03-21T15:42:26.222594Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1224,7 +1275,10 @@
|
||||
"end_time": "2024-03-21T16:18:45.930652Z",
|
||||
"start_time": "2024-03-21T16:18:42.989032Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1246,7 +1300,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "b347fb5ee68daf60",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Batch process the embedding\n",
|
||||
@ -1270,7 +1327,10 @@
|
||||
"end_time": "2024-03-21T16:04:44.572979Z",
|
||||
"start_time": "2024-03-21T16:04:43.521107Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1293,7 +1353,10 @@
|
||||
"end_time": "2024-03-21T16:10:22.121211Z",
|
||||
"start_time": "2024-03-21T16:08:20.585372Z"
|
||||
},
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1397,9 +1460,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f01a969c4aedac8",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -1410,21 +1477,21 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Loading…
Reference in New Issue
Block a user