Created using Colaboratory

main
Marius Ciepluch 2024-04-04 17:53:57 +02:00
parent 564ed54344
commit eef5b798b4
1 changed files with 70 additions and 53 deletions

View File

@ -50,6 +50,8 @@
"source": [
"import sys\n",
"import os\n",
"import subprocess\n",
"\n",
"IN_COLAB = 'google.colab' in sys.modules\n",
"\n",
"if not IN_COLAB:\n",
@ -69,6 +71,12 @@
" output_path_extracted_notes = \"/content/export.txt\"\n",
" output_path_extracted_docs = \"/content/export.documents.txt\"\n",
" result_db = \"/content/evernote.db\"\n",
" # Prepare the Colab environment with the project's setup script.\n",
" # The script is downloaded first with `curl -f` so that an HTTP or network\n",
" # error aborts with a non-zero exit status and check=True raises\n",
" # CalledProcessError. With `source <(curl -s ...)` a failed download is\n",
" # sourced as empty input and the call silently succeeds.\n",
" # The script runs in a child bash process, so variables it exports do not\n",
" # propagate into this Python kernel.\n",
" # NOTE(review): this executes remote code from the `main` branch; consider\n",
" # pinning the URL to a commit hash.\n",
" subprocess.run('''\n",
" setup_script=\"$(curl -fsSL https://raw.githubusercontent.com/norandom/project_bookworm/main/scripts/prepare_colab_env.sh)\" || exit $?\n",
" source <(printf %s \"$setup_script\")\n",
" ''',\n",
" shell=True, check=True,\n",
" executable='/bin/bash')\n",
"\n",
"\n",
"# To suppress some warnings\n",
"import os\n",
@ -76,16 +84,14 @@
]
},
{
"cell_type": "code",
"cell_type": "markdown",
"source": [
"# Controls:"
"# Checks"
],
"metadata": {
"id": "8tcn27pzvpRi"
"id": "yuhXPdN_z2cW"
},
"id": "8tcn27pzvpRi",
"execution_count": 2,
"outputs": []
"id": "yuhXPdN_z2cW"
},
{
"cell_type": "code",
@ -111,6 +117,16 @@
}
]
},
{
"cell_type": "markdown",
"source": [
"## For the progress bars in Colab"
],
"metadata": {
"id": "B02AY_Gez61T"
},
"id": "B02AY_Gez61T"
},
{
"cell_type": "code",
"source": [
@ -668,7 +684,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 28,
"id": "3081256c9cf22780",
"metadata": {
"ExecuteTime": {
@ -679,7 +695,7 @@
"base_uri": "https://localhost:8080/"
},
"id": "3081256c9cf22780",
"outputId": "5630ab81-0756-4a31-dfd5-91f3e34365c1"
"outputId": "0a02f0bc-42ce-4f50-e670-fd8ca48111a9"
},
"outputs": [
{
@ -698,7 +714,7 @@
"import torch\n",
"use_cuda = torch.cuda.is_available()\n",
"\n",
"USE_GPU=False\n",
"USE_GPU=True\n",
"\n",
"if use_cuda:\n",
" print('__CUDNN VERSION:', torch.backends.cudnn.version())\n",
@ -709,7 +725,8 @@
" print(\"GPU enabled\")\n",
"\n",
"if not use_cuda:\n",
" print('No CUDA available')"
" print('No CUDA available')\n",
" USE_GPU=False\n"
]
},
{
@ -867,33 +884,33 @@
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"97a1b60642a848c28b7c9654dccd1de6",
"8e9c21cbf4e84b228589b04c28031986",
"c6b0f42e62b942ff93b0dd203b71afaf",
"24988c14693f4b739617f6b148981712",
"5c56bada895c45f8b955db0b322a30b4",
"b83ec3c178fa47a3b5466974280db85b",
"5d799e20928448c18071eecc9f513789",
"4a251ff30d6448f3b256692453637a6a",
"90238269c43647caa0a8731f0290d64e",
"4208c14a250c40feb246c67e8141ca99",
"976ae07a9b19499991f724a7e40f7e6d"
"7639624db8b44b7194b33e1587015e7b",
"d1f1b83ff52b4f339aaf7e3472e88f5f",
"a4c9656af7644c8794eea9cd95cdbb38",
"eece25d48bd94132bd6e9c25001dd0a3",
"77829cd4ef2341c58bd37ce7fb173fbf",
"78e5715e33af4af9a72f348a3cff7a45",
"89b0f4fbd3c542c6abb5ea2ba0b937fc",
"6013ba1807144ee2b0b4c83d42cf1977",
"20db3230c722479db16949f232e23fc8",
"54e9a4e180d74916b620c46cf4da6546",
"b1a5928250a94055a95a026804807cf0"
]
},
"id": "e6ffc345c26298ad",
"outputId": "68ff33d9-8c75-4701-96ef-1484a0e5d9d4"
"outputId": "c63ea480-2c4f-447e-e948-f51ec0ea6224"
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Processing batches: 0%| | 0/966 [00:00<?, ?it/s]"
"Processing batches: 0%| | 0/1448 [00:00<?, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "97a1b60642a848c28b7c9654dccd1de6"
"model_id": "7639624db8b44b7194b33e1587015e7b"
}
},
"metadata": {}
@ -925,7 +942,7 @@
"def vectorize_data_in_batches(chunks, embeddings):\n",
"\n",
" num_workers = 3\n",
" batch_size = 750 # Adjust based on your needs and memory constraints\n",
" batch_size = 500 # Adjust based on your needs and memory constraints\n",
"\n",
" batches = list(divide_chunks(chunks, batch_size))\n",
" faiss_db = None\n",
@ -970,11 +987,11 @@
"print(type(faiss))"
],
"metadata": {
"id": "v6bhYHU5_9oo",
"outputId": "01bbcac4-01dc-4efe-e9e9-31c8d2b6ca56",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"id": "v6bhYHU5_9oo",
"outputId": "01bbcac4-01dc-4efe-e9e9-31c8d2b6ca56"
},
"id": "v6bhYHU5_9oo",
"execution_count": 15,
@ -1013,7 +1030,7 @@
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"97a1b60642a848c28b7c9654dccd1de6": {
"7639624db8b44b7194b33e1587015e7b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
@ -1028,14 +1045,14 @@
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_8e9c21cbf4e84b228589b04c28031986",
"IPY_MODEL_c6b0f42e62b942ff93b0dd203b71afaf",
"IPY_MODEL_24988c14693f4b739617f6b148981712"
"IPY_MODEL_d1f1b83ff52b4f339aaf7e3472e88f5f",
"IPY_MODEL_a4c9656af7644c8794eea9cd95cdbb38",
"IPY_MODEL_eece25d48bd94132bd6e9c25001dd0a3"
],
"layout": "IPY_MODEL_5c56bada895c45f8b955db0b322a30b4"
"layout": "IPY_MODEL_77829cd4ef2341c58bd37ce7fb173fbf"
}
},
"8e9c21cbf4e84b228589b04c28031986": {
"d1f1b83ff52b4f339aaf7e3472e88f5f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
@ -1050,13 +1067,13 @@
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b83ec3c178fa47a3b5466974280db85b",
"layout": "IPY_MODEL_78e5715e33af4af9a72f348a3cff7a45",
"placeholder": "",
"style": "IPY_MODEL_5d799e20928448c18071eecc9f513789",
"value": "Processingbatches:0%"
"style": "IPY_MODEL_89b0f4fbd3c542c6abb5ea2ba0b937fc",
"value": "Processingbatches:66%"
}
},
"c6b0f42e62b942ff93b0dd203b71afaf": {
"a4c9656af7644c8794eea9cd95cdbb38": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
@ -1072,15 +1089,15 @@
"bar_style": "",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4a251ff30d6448f3b256692453637a6a",
"max": 966,
"layout": "IPY_MODEL_6013ba1807144ee2b0b4c83d42cf1977",
"max": 1448,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_90238269c43647caa0a8731f0290d64e",
"value": 3
"style": "IPY_MODEL_20db3230c722479db16949f232e23fc8",
"value": 957
}
},
"24988c14693f4b739617f6b148981712": {
"eece25d48bd94132bd6e9c25001dd0a3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
@ -1095,13 +1112,13 @@
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4208c14a250c40feb246c67e8141ca99",
"layout": "IPY_MODEL_54e9a4e180d74916b620c46cf4da6546",
"placeholder": "",
"style": "IPY_MODEL_976ae07a9b19499991f724a7e40f7e6d",
"value": "3/966[00:52&lt;3:11:12,11.91s/it]"
"style": "IPY_MODEL_b1a5928250a94055a95a026804807cf0",
"value": "957/1448[3:10:16&lt;1:30:31,11.06s/it]"
}
},
"5c56bada895c45f8b955db0b322a30b4": {
"77829cd4ef2341c58bd37ce7fb173fbf": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
@ -1153,7 +1170,7 @@
"width": null
}
},
"b83ec3c178fa47a3b5466974280db85b": {
"78e5715e33af4af9a72f348a3cff7a45": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
@ -1205,7 +1222,7 @@
"width": null
}
},
"5d799e20928448c18071eecc9f513789": {
"89b0f4fbd3c542c6abb5ea2ba0b937fc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
@ -1220,7 +1237,7 @@
"description_width": ""
}
},
"4a251ff30d6448f3b256692453637a6a": {
"6013ba1807144ee2b0b4c83d42cf1977": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
@ -1272,7 +1289,7 @@
"width": null
}
},
"90238269c43647caa0a8731f0290d64e": {
"20db3230c722479db16949f232e23fc8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
@ -1288,7 +1305,7 @@
"description_width": ""
}
},
"4208c14a250c40feb246c67e8141ca99": {
"54e9a4e180d74916b620c46cf4da6546": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
@ -1340,7 +1357,7 @@
"width": null
}
},
"976ae07a9b19499991f724a7e40f7e6d": {
"b1a5928250a94055a95a026804807cf0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",