diff --git a/EverNote_To_OpenAI.ipynb b/EverNote_To_OpenAI.ipynb index 31a4681..dd77ae4 100644 --- a/EverNote_To_OpenAI.ipynb +++ b/EverNote_To_OpenAI.ipynb @@ -4,7 +4,10 @@ "cell_type": "markdown", "id": "18d62071e34b0d53", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# This is an experiment: create vectorized embeddings out of an EverNote DB (PDF, DOCX, HTML, TXT)\n", @@ -23,7 +26,10 @@ "cell_type": "markdown", "id": "a8c8692786d83c00", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Dependencies\n", @@ -40,7 +46,10 @@ "end_time": "2024-03-21T15:10:31.827945Z", "start_time": "2024-03-21T15:10:29.646399Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -69,7 +78,10 @@ "cell_type": "markdown", "id": "297746c807e95fbf", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "* pikepdf is used to repair some PDFs" @@ -84,7 +96,10 @@ "end_time": "2024-03-21T15:12:47.900384Z", "start_time": "2024-03-21T15:12:45.782477Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -113,7 +128,10 @@ "cell_type": "markdown", "id": "7c7a7f6b0db3719e", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "* pypdf with all features is needed because this DB consists of 100+ PDFs " @@ -128,7 +146,10 @@ "end_time": "2024-03-21T15:17:00.760871Z", "start_time": "2024-03-21T15:16:58.635484Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -157,7 +178,10 @@ "cell_type": "markdown", "id": "ce1350d2d6e3ed63", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Text extraction\n", @@ -179,7 +203,10 @@ "end_time": "2024-03-17T15:34:05.847778Z", "start_time": "2024-03-17T15:25:49.787814Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1038,7 +1065,10 @@ "cell_type": "markdown", "id": "e1bcc07f980c865f", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Chunking of the texts\n", @@ -1055,7 +1085,10 @@ "end_time": "2024-03-17T16:13:14.479469Z", "start_time": "2024-03-17T16:13:14.476765Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1071,7 +1104,10 @@ "end_time": "2024-03-21T15:17:53.867414Z", "start_time": "2024-03-21T15:17:32.731232Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1107,7 +1143,10 @@ "cell_type": "markdown", "id": "aea7ceb111fed5f3", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "### Embedding costs - why no OpenAI?\n", @@ -1126,7 +1165,10 @@ "end_time": "2024-03-21T15:18:51.003585Z", "start_time": "2024-03-21T15:18:31.411234Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1153,7 +1195,10 @@ "cell_type": "markdown", "id": "8012516604037e2f", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Use Hugging Face Embeddings Sentence Transformers\n", @@ -1177,7 +1222,10 @@ "end_time": "2024-03-21T15:19:15.167038Z", "start_time": "2024-03-21T15:19:15.031139Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1201,7 +1249,10 @@ "end_time": "2024-03-21T15:42:28.163005Z", "start_time": "2024-03-21T15:42:26.222594Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1224,7 +1275,10 @@ "end_time": "2024-03-21T16:18:45.930652Z", "start_time": "2024-03-21T16:18:42.989032Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1246,7 +1300,10 @@ "cell_type": "markdown", "id": "b347fb5ee68daf60", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Batch process the embedding\n", @@ -1270,7 +1327,10 @@ "end_time": "2024-03-21T16:04:44.572979Z", "start_time": "2024-03-21T16:04:43.521107Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1293,7 +1353,10 @@ "end_time": "2024-03-21T16:10:22.121211Z", "start_time": "2024-03-21T16:08:20.585372Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1397,9 +1460,13 @@ }, { "cell_type": "code", + "execution_count": null, "id": "5f01a969c4aedac8", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -1410,21 +1477,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.8" } }, "nbformat": 4,