diff --git a/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb b/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb new file mode 100644 index 0000000..023a117 --- /dev/null +++ b/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb @@ -0,0 +1,5505 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm", + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "13c7730f61b24661bca4b4406f488fbb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_82838b6e6693488682c97ce82ed723bf", + "IPY_MODEL_41b566d613234e03876a7c6a68431d7b", + "IPY_MODEL_dd83f610ca584acbba3738f2c137b2b9" + ], + "layout": "IPY_MODEL_ad4f22541b284758b868c95e5e8ceff0" + } + }, + "82838b6e6693488682c97ce82ed723bf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_13d9c546446a482ab6634e2c69c70685", + "placeholder": "​", + "style": "IPY_MODEL_10c31ef1ded9471cae583be49e7092e5", + "value": "100%" + } + }, + 
"41b566d613234e03876a7c6a68431d7b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c1ad9d79f8464e71bd79ae848c103a58", + "max": 8039037, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0de9cbe7986c49fb99ae4f3abdba42d9", + "value": 8039037 + } + }, + "dd83f610ca584acbba3738f2c137b2b9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b12aceb2e4c46eeb708d0877a9ca639", + "placeholder": "​", + "style": "IPY_MODEL_b6edc45065c7488a8b8be5e1349b27ba", + "value": " 8.04M/8.04M [00:00<00:00, 9.88MiB/s]" + } + }, + "ad4f22541b284758b868c95e5e8ceff0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": 
null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "13d9c546446a482ab6634e2c69c70685": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "10c31ef1ded9471cae583be49e7092e5": { + "model_module": "@jupyter-widgets/controls", + 
"model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c1ad9d79f8464e71bd79ae848c103a58": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0de9cbe7986c49fb99ae4f3abdba42d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6b12aceb2e4c46eeb708d0877a9ca639": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b6edc45065c7488a8b8be5e1349b27ba": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6d9d173f139a43e3a6b0b1deb2fb557b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_34569214f37b44e0802ddd403a4fa1a9", + "IPY_MODEL_2bdfcb2139bb4ae4b670c1fb49db6911", + "IPY_MODEL_b27ff271d01a49e38a32cd3ddcebe77a" + ], + "layout": "IPY_MODEL_92e050af8c674152b4d279fdda960458" + } + }, + "34569214f37b44e0802ddd403a4fa1a9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f00da7475754dad8d5d5ce800ae673d", + "placeholder": "​", + "style": "IPY_MODEL_1a9b0c9e9cb941fbb57529d10d054084", + "value": "100%" + } + }, + "2bdfcb2139bb4ae4b670c1fb49db6911": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_545d68c2b8b742da9dd7983a3321ebf1", + "max": 1716416, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8956f45722f841dab504335dcc1921bc", + "value": 1716416 + } + }, + "b27ff271d01a49e38a32cd3ddcebe77a": { + "model_module": "@jupyter-widgets/controls", + 
"model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1785191337814317989989f0800e1424", + "placeholder": "​", + "style": "IPY_MODEL_d25bea1e539c4d328f05ded6af8b8154", + "value": " 1.72M/1.72M [00:00<00:00, 9.13MiB/s]" + } + }, + "92e050af8c674152b4d279fdda960458": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f00da7475754dad8d5d5ce800ae673d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { 
+ "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1a9b0c9e9cb941fbb57529d10d054084": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "545d68c2b8b742da9dd7983a3321ebf1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": 
null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8956f45722f841dab504335dcc1921bc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1785191337814317989989f0800e1424": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d25bea1e539c4d328f05ded6af8b8154": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bcf5338c159d4d53bc3e39e717885292": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7276e9b8b79d41ab991296d44521e2ee", + "IPY_MODEL_f2ab86c6413649a595edf4b8feeb6a66", + "IPY_MODEL_decc923894ab4ef4b35fa627b1b2dfb4" + ], + "layout": "IPY_MODEL_9875e07950114beca6741a51962c1f30" + } + }, + "7276e9b8b79d41ab991296d44521e2ee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_90db1bae92c94a49ab8bc9c12073e552", + "placeholder": "​", + "style": "IPY_MODEL_92a42873a8ef4c00ad24ef47a62b4093", + "value": "Optimization Progress: 100%" + } + }, + "f2ab86c6413649a595edf4b8feeb6a66": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1826f0b9003a4f8bb1c074f526560c86", + "max": 120, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_549831432f244f5080c74eb842bae875", + "value": 120 + } + }, + "decc923894ab4ef4b35fa627b1b2dfb4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_20be31d5be524d76bc2dbb3baad4edf3", + "placeholder": "​", + "style": "IPY_MODEL_fc71d71e57774cb1a0df69e82d3e7c2d", + "value": " 120/120 [02:00<00:00,  1.16pipeline/s]" + } + }, + "9875e07950114beca6741a51962c1f30": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": 
"1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "90db1bae92c94a49ab8bc9c12073e552": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "92a42873a8ef4c00ad24ef47a62b4093": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1826f0b9003a4f8bb1c074f526560c86": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"549831432f244f5080c74eb842bae875": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "20be31d5be524d76bc2dbb3baad4edf3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fc71d71e57774cb1a0df69e82d3e7c2d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8a807c2780cf4a65ad92dfa1ead0e439": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_021bf8d789b74e3d899a99eeb82a0036", + "IPY_MODEL_f7aeed7c0692419591ca57bffee51739", + "IPY_MODEL_d7f2da7e3beb42dfaf31dc69eb7a597e" + ], + "layout": "IPY_MODEL_c7b913c695fc448c8fc7b31f7098f636" + } + }, + "021bf8d789b74e3d899a99eeb82a0036": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a59325353a1d4902b03f5a37c0293934", + "placeholder": "​", + "style": "IPY_MODEL_6cd3d0fc462c4b14bbe6be92b8e6103e", + "value": "100%" + } + }, + "f7aeed7c0692419591ca57bffee51739": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + 
"bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0cfdc6a2fa49499e9f0ce350b88dfbc4", + "max": 3288938, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a9c70dae97424ca290a75e10d24baf52", + "value": 3288938 + } + }, + "d7f2da7e3beb42dfaf31dc69eb7a597e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f0fc3fee01b04ffc922096ee18b6cb0f", + "placeholder": "​", + "style": "IPY_MODEL_9142d6e9793648a9909e5ce3dabf415b", + "value": " 3.29M/3.29M [00:00<00:00, 9.24MiB/s]" + } + }, + "c7b913c695fc448c8fc7b31f7098f636": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, 
+ "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a59325353a1d4902b03f5a37c0293934": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6cd3d0fc462c4b14bbe6be92b8e6103e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0cfdc6a2fa49499e9f0ce350b88dfbc4": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9c70dae97424ca290a75e10d24baf52": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f0fc3fee01b04ffc922096ee18b6cb0f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9142d6e9793648a9909e5ce3dabf415b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2380e728c4b94d4a82d99c5df53bcfb8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_60e0fda3b0b64788b12de0492e48a8eb", + "IPY_MODEL_b4aab6b20b704970a6d88b4203aafe33", + "IPY_MODEL_94853d6fcd9e485ab00941270501bdc2" + ], + "layout": 
"IPY_MODEL_85171f4f5f6f4cf798c8f0afc85c0f45" + } + }, + "60e0fda3b0b64788b12de0492e48a8eb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_188af8de0a894c4b862c04ae22f6e2ab", + "placeholder": "​", + "style": "IPY_MODEL_cca3270cffa747b0819b1a50fff1ef02", + "value": "Optimization Progress: 100%" + } + }, + "b4aab6b20b704970a6d88b4203aafe33": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2f02cde9cf5f45fc8ce05a545baad40c", + "max": 20, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_aa35eac1e7964054bf8ab8903cd3e182", + "value": 20 + } + }, + "94853d6fcd9e485ab00941270501bdc2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf41ac8c2a6f47e6b6956e4389485e3e", + 
"placeholder": "​", + "style": "IPY_MODEL_6f7d22c33bbd434e83d5764c7ebed942", + "value": " 120/120 [01:48<00:00,  1.01s/pipeline]" + } + }, + "85171f4f5f6f4cf798c8f0afc85c0f45": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "188af8de0a894c4b862c04ae22f6e2ab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": 
null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cca3270cffa747b0819b1a50fff1ef02": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2f02cde9cf5f45fc8ce05a545baad40c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa35eac1e7964054bf8ab8903cd3e182": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bf41ac8c2a6f47e6b6956e4389485e3e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + 
"right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f7d22c33bbd434e83d5764c7ebed942": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "612cffdae98544d480553f9a40e6bd01": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9eb0c8bff0b6480caac00ebc3381b0d7", + "IPY_MODEL_ef6da18b47a04feeabf4d5e1d0437f0b", + "IPY_MODEL_a992cbe30d1c47b09724593a260a690c" + ], + "layout": "IPY_MODEL_ee97630f2ff341bfb51cedf7c99bf39c" + } + }, + "9eb0c8bff0b6480caac00ebc3381b0d7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c89e1c3487e040caa253aad97f240fe7", + "placeholder": "​", + "style": "IPY_MODEL_31666166fb934ef4b8d568f609db9fb3", + "value": "Optimization Progress: 100%" + } + }, + "ef6da18b47a04feeabf4d5e1d0437f0b": { + "model_module": "@jupyter-widgets/controls", + 
"model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ee32d42a6124f9fa35a04115268427d", + "max": 20, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d78b5fb09a4d4aacb52940e2a12de183", + "value": 20 + } + }, + "a992cbe30d1c47b09724593a260a690c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d8a982183d3f45d2b77dddd74f4ff9e7", + "placeholder": "​", + "style": "IPY_MODEL_f679dcba359e460ebe3d763e086922c7", + "value": " 120/120 [01:46<00:00,  1.14pipeline/s]" + } + }, + "ee97630f2ff341bfb51cedf7c99bf39c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "c89e1c3487e040caa253aad97f240fe7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "31666166fb934ef4b8d568f609db9fb3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2ee32d42a6124f9fa35a04115268427d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d78b5fb09a4d4aacb52940e2a12de183": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "d8a982183d3f45d2b77dddd74f4ff9e7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f679dcba359e460ebe3d763e086922c7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e329ae9f00ae426caba593bf82062720": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6b18e9941ad8489a856d8951086dc0ec", + "IPY_MODEL_bd1ef0b1216f44e981e06617353af203", + "IPY_MODEL_cb16a7ccc2d048699ec81bd9cc08782b" + ], + "layout": "IPY_MODEL_2057161353254330b31e15c030f5c77e" + } + }, + "6b18e9941ad8489a856d8951086dc0ec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2768fc55816a4ddabd3389b3e6c9a0df", + "placeholder": "​", + "style": "IPY_MODEL_12c31e7d69934b1bb13b7e4de18f9aee", + "value": "Optimization Progress: 100%" + } + }, + "bd1ef0b1216f44e981e06617353af203": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d3f693b6862747fbab9f786e7ef0a9cf", + "max": 40, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f3a099a3356044e7a641d590c6381041", + "value": 40 + } + }, + "cb16a7ccc2d048699ec81bd9cc08782b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": 
{ + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aab64f46dfdf4948bec48ef02618cf75", + "placeholder": "​", + "style": "IPY_MODEL_b83434e8778040d4975f5fab89a976ec", + "value": " 40/40 [00:09<00:00,  5.26pipeline/s]" + } + }, + "2057161353254330b31e15c030f5c77e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "2768fc55816a4ddabd3389b3e6c9a0df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12c31e7d69934b1bb13b7e4de18f9aee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d3f693b6862747fbab9f786e7ef0a9cf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f3a099a3356044e7a641d590c6381041": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "aab64f46dfdf4948bec48ef02618cf75": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b83434e8778040d4975f5fab89a976ec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3ae62907413b4d969f53c4a3092fb46d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5af08e85e25040b19544493df13617d5", + "IPY_MODEL_439b9e942cec4fe699f3e561082585b5", + "IPY_MODEL_b4b43f188f454890a04bc1869680c46d" + ], + "layout": "IPY_MODEL_4de8382a838648b6bf2d639b5ae65c7b" + } + }, + "5af08e85e25040b19544493df13617d5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_32aa5ea1c8354c0eb3538819a48e81ea", + "placeholder": "​", + "style": "IPY_MODEL_ed0c20dcb4764e629766a2e31d019edf", + "value": "Optimization Progress: 100%" + } + }, + "439b9e942cec4fe699f3e561082585b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_90db1881f3254f71adf5d809c0d83bae", + "max": 40, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_87b95caad65649a9b85bdc015c665838", + "value": 40 + } + }, + "b4b43f188f454890a04bc1869680c46d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8151a7980fc346debb0caf1c20af5429", + "placeholder": "​", + "style": "IPY_MODEL_2f6c4ac937de426ab94fdde3fc4fc95a", + "value": " 40/40 [00:04<00:00,  7.79pipeline/s]" + } + }, + "4de8382a838648b6bf2d639b5ae65c7b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, 
+ "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "32aa5ea1c8354c0eb3538819a48e81ea": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ed0c20dcb4764e629766a2e31d019edf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "90db1881f3254f71adf5d809c0d83bae": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "87b95caad65649a9b85bdc015c665838": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8151a7980fc346debb0caf1c20af5429": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f6c4ac937de426ab94fdde3fc4fc95a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# AutoML with TPOT using Adversary Emulation (C2) as datagen - Supervised and Deep Learning with Genetic Programming" + ], + "metadata": { + "id": "YJeseKzwjOVm" + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "wUrhwXsDkETs" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Installations\n", + "\n", + "The installation is mostly automated.\n", + "\n", + "A file in the same directory named \"thesis_ro\" will be required, which should contain your GitHub read-only token.\n", + "\n", + "The file contains a single line, which can be created with:\n", + "\n", + "\n", + "`echo \"GITHUB_PERSONAL_ACCESS_TOKEN=ghp_...\" > thesis_ro`" + ], + "metadata": { + "id": "Fv4KCLz9j4Zc" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "ayxWRSxzgwh_" + }, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import subprocess\n", + "\n", + "IN_COLAB = 'google.colab' in sys.modules\n", + "\n", + "if not IN_COLAB:\n", + " pass\n", + "\n", + "else:\n", + " subprocess.run('''\n", + " source <(curl -s https://raw.githubusercontent.com/norandom/log2ml/main/dependencies/install.sh)\n", + " ''',\n", + " shell=True, check=True, executable='/bin/bash')\n", + "\n" + ] + }, + { + "cell_type": "code", + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "load_dotenv(\"thesis_ro\", verbose=True) # take environment variables from the file\n", + "token = os.getenv('GITHUB_PERSONAL_ACCESS_TOKEN')\n", + "if len(token) > 0:\n", + " print(\"ok\")\n", + "else:\n", + " print(\"no token\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vyKWa35bkFcG", + "outputId": "61b4bff8-193b-40f0-fb90-d1d1a8c2010c" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", 
+ "name": "stdout", + "text": [ + "ok\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Data download: captured Sysmon logs from the AE lab\n", + "\n", + "These samples contain Sysmon log activity of Dropper Malware (C2 Dropper, MS Excel VBA, Covenant).\n", + "\n", + "No AE campaigns, just the Dropper itself.\n", + "\n", + "We are looking at 1000 documents, some malicious and some not. Which ones are malicious? How does the VBA Excel malware behave? How not? Can ML help to find out?" + ], + "metadata": { + "id": "oYVNy4rNojZc" + } + }, + { + "cell_type": "code", + "source": [ + "from github import Github\n", + "import requests\n", + "from tqdm.notebook import tqdm\n", + "\n", + "\n", + "def get_specific_file_from_tagged_release(token, repo_name, tag_name, filename):\n", + " g = Github(token)\n", + " repo = g.get_repo(repo_name)\n", + " releases = repo.get_releases()\n", + "\n", + " for release in releases:\n", + " if release.tag_name == tag_name:\n", + " for asset in release.get_assets():\n", + " if asset.name == filename:\n", + " return asset.url\n", + " print(\"File not found. 
Try get_specific_file_from_latest_release() instead.\")\n", + " return None\n", + "\n", + "def get_specific_file_from_latest_release(token, repo_name, filename):\n", + " g = Github(token)\n", + " repo = g.get_repo(repo_name)\n", + " release = repo.get_latest_release()\n", + "\n", + " for asset in release.get_assets():\n", + " if asset.name == filename:\n", + " return asset.url # Use asset.url which points to API URL needing headers\n", + "\n", + "def download_file(url, token, save_path):\n", + " headers = {'Authorization': f'token {token}', 'Accept': 'application/octet-stream'}\n", + " # First request to handle GitHub's redirection and authentication properly\n", + " with requests.get(url, headers=headers, stream=True) as initial_response:\n", + " initial_response.raise_for_status() # Ensure the initial request is successful\n", + " # Follow redirection if necessary, maintaining headers\n", + " if initial_response.history:\n", + " url = initial_response.url # Updated URL after redirection\n", + "\n", + " # Now, proceed with downloading the file\n", + " with requests.get(url, headers=headers, stream=True) as response:\n", + " response.raise_for_status()\n", + " total_size_in_bytes = int(response.headers.get('content-length', 0))\n", + " block_size = 1024\n", + "\n", + " progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)\n", + " with open(save_path, 'wb') as file:\n", + " for data in response.iter_content(block_size):\n", + " progress_bar.update(len(data))\n", + " file.write(data)\n", + " progress_bar.close()\n", + "\n", + " if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:\n", + " print(\"ERROR, something went wrong\")\n", + " else:\n", + " print(f\"File downloaded successfully and saved as {save_path}\")\n", + "\n", + "# Your GitHub token\n", + "github_token = token\n", + "\n", + "# Repository name\n", + "repository_name = \"norandom/log2ml\"\n", + "\n", + "# File name to search for\n", + "file_name = 
\"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024.csv\"\n", + "\n", + "# Get the download URL of the specific file\n", + "# download_url = get_specific_file_from_latest_release(github_token, repository_name, file_name)\n", + "download_url = get_specific_file_from_tagged_release(github_token, repository_name, \"lab\", file_name)\n", + "print(download_url)\n", + "\n", + "if download_url:\n", + " local_file_path = file_name\n", + " download_file(download_url, github_token, local_file_path)\n", + "else:\n", + " print(\"File not found.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85, + "referenced_widgets": [ + "13c7730f61b24661bca4b4406f488fbb", + "82838b6e6693488682c97ce82ed723bf", + "41b566d613234e03876a7c6a68431d7b", + "dd83f610ca584acbba3738f2c137b2b9", + "ad4f22541b284758b868c95e5e8ceff0", + "13d9c546446a482ab6634e2c69c70685", + "10c31ef1ded9471cae583be49e7092e5", + "c1ad9d79f8464e71bd79ae848c103a58", + "0de9cbe7986c49fb99ae4f3abdba42d9", + "6b12aceb2e4c46eeb708d0877a9ca639", + "b6edc45065c7488a8b8be5e1349b27ba" + ] + }, + "id": "5EEoa3gUmFpn", + "outputId": "83f18cff-926c-467d-dc4e-089e69342895" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "https://api.github.com/repos/norandom/log2ml/releases/assets/182477524\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0.00/8.04M [00:00 1:\n", + " processed_lines.append(f\"User: {parts[1]}\")\n", + " elif keyword in ['SourceHostname', 'DestinationHostname']:\n", + " parts = line.split(':', 1)\n", + " if len(parts) > 1:\n", + " hostname = parts[1].strip().split('.')[0]\n", + " processed_lines.append(f\"{keyword}: {hostname}\")\n", + " elif keyword == 'QueryName':\n", + " parts = line.split(':', 1)\n", + " if len(parts) > 1:\n", + " domain = parts[1].strip().split('.')\n", + " if len(domain) > 0:\n", + " processed_lines.append(f\"QueryName: {domain[0]}\")\n", + " 
else:\n", + " processed_lines.append(line)\n", + " processed = True\n", + " break\n", + " if not processed:\n", + " processed_lines.append(line)\n", + "\n", + " return '\\n'.join(processed_lines)\n", + "\n", + " return batch.map_elements(modify_message, return_dtype=pl.Utf8)\n", + "\n", + "# Keywords to filter or process\n", + "keywords_to_filter = [\"UtcTime\", \"SourceProcessGUID\", \"ProcessGuid\", \"TargetProcessGUID\", \"TargetObject\", \"FileVersion\", \"Hashes\", \"LogonGuid\", \"LogonId\", \"CreationUtcTime\", \"User\", \"ParentProcessGuid\", \"SourceHostname\", \"DestinationHostname\", \"QueryName\"]\n", + "\n", + "# Apply the transformation to the 'message' column using map_batches\n", + "df_f = df.with_columns(\n", + " pl.col(\"message\").map_batches(lambda batch: remove_keyword_lines(batch, keywords_to_filter), return_dtype=pl.Utf8).alias(\"filtered_message\")\n", + ")\n", + "\n", + "# Show the DataFrame to confirm it's loaded correctly\n", + "print(df_f)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_CJUqGQUqFew", + "outputId": "8f117d78-c99e-4c42-83ac-796ba6a2389a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape: (13_455, 8)\n", + "┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐\n", + "│ @timestamp ┆ host.hostn ┆ host.ip ┆ log.level ┆ winlog.ev ┆ winlog.ta ┆ message ┆ filtered_ │\n", + "│ --- ┆ ame ┆ --- ┆ --- ┆ ent_id ┆ sk ┆ --- ┆ message │\n", + "│ str ┆ --- ┆ str ┆ str ┆ --- ┆ --- ┆ str ┆ --- │\n", + "│ ┆ str ┆ ┆ ┆ i64 ┆ str ┆ ┆ str │\n", + "╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 3 ┆ Network ┆ Network ┆ Network │\n", + "│ T15:08:24. 
┆ ┆ :35de:6006 ┆ on ┆ ┆ connectio ┆ connectio ┆ connectio │\n", + "│ 277Z ┆ ┆ :d4cf ┆ ┆ ┆ n ┆ n ┆ n │\n", + "│ ┆ ┆ ┆ ┆ ┆ detected ┆ detected: ┆ detected: │\n", + "│ ┆ ┆ ┆ ┆ ┆ (rul… ┆ Rul… ┆ Rul… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 3 ┆ Network ┆ Network ┆ Network │\n", + "│ T15:08:24. ┆ ┆ :35de:6006 ┆ on ┆ ┆ connectio ┆ connectio ┆ connectio │\n", + "│ 488Z ┆ ┆ :d4cf ┆ ┆ ┆ n ┆ n ┆ n │\n", + "│ ┆ ┆ ┆ ┆ ┆ detected ┆ detected: ┆ detected: │\n", + "│ ┆ ┆ ┆ ┆ ┆ (rul… ┆ Rul… ┆ Rul… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 10 ┆ Process ┆ Process ┆ Process │\n", + "│ T15:08:25. ┆ ┆ :35de:6006 ┆ on ┆ ┆ accessed ┆ accessed: ┆ accessed: │\n", + "│ 005Z ┆ ┆ :d4cf ┆ ┆ ┆ (rule: ┆ RuleName: ┆ RuleName: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ProcessA… ┆ - ┆ - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ Ut… ┆ Ut… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 10 ┆ Process ┆ Process ┆ Process │\n", + "│ T15:08:25. ┆ ┆ :35de:6006 ┆ on ┆ ┆ accessed ┆ accessed: ┆ accessed: │\n", + "│ 005Z ┆ ┆ :d4cf ┆ ┆ ┆ (rule: ┆ RuleName: ┆ RuleName: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ProcessA… ┆ - ┆ - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ Ut… ┆ Ut… │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 10 ┆ Process ┆ Process ┆ Process │\n", + "│ T23:35:53. ┆ ┆ :35de:6006 ┆ on ┆ ┆ accessed ┆ accessed: ┆ accessed: │\n", + "│ 054Z ┆ ┆ :d4cf ┆ ┆ ┆ (rule: ┆ RuleName: ┆ RuleName: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ProcessA… ┆ - ┆ - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ Ut… ┆ Ut… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 10 ┆ Process ┆ Process ┆ Process │\n", + "│ T23:35:54. ┆ ┆ :35de:6006 ┆ on ┆ ┆ accessed ┆ accessed: ┆ accessed: │\n", + "│ 133Z ┆ ┆ :d4cf ┆ ┆ ┆ (rule: ┆ RuleName: ┆ RuleName: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ProcessA… ┆ - ┆ - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ Ut… ┆ Ut… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 10 ┆ Process ┆ Process ┆ Process │\n", + "│ T23:35:54. 
┆ ┆ :35de:6006 ┆ on ┆ ┆ accessed ┆ accessed: ┆ accessed: │\n", + "│ 133Z ┆ ┆ :d4cf ┆ ┆ ┆ (rule: ┆ RuleName: ┆ RuleName: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ProcessA… ┆ - ┆ - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ Ut… ┆ Ut… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informati ┆ 1 ┆ Process ┆ Process ┆ Process │\n", + "│ T23:41:55. ┆ ┆ :35de:6006 ┆ on ┆ ┆ Create ┆ Create: ┆ Create: │\n", + "│ 301Z ┆ ┆ :d4cf ┆ ┆ ┆ (rule: ┆ RuleName: ┆ RuleName: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ProcessCr ┆ - ┆ - │\n", + "│ ┆ ┆ ┆ ┆ ┆ e… ┆ UtcT… ┆ UtcT… │\n", + "└────────────┴────────────┴────────────┴───────────┴───────────┴───────────┴───────────┴───────────┘\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import re\n", + "\n", + "# Extract relevant information using regular expressions\n", + "def extract_info(text):\n", + " image = re.search(r\"Image: (.*?\\.exe)\", text, re.IGNORECASE)\n", + " target_filename = re.search(r\"TargetFilename: (.*?\\.exe)\", text, re.IGNORECASE)\n", + " parent_image = re.search(r\"ParentImage: (.*?\\.exe)\", text, re.IGNORECASE)\n", + "\n", + " return {\n", + " \"image\": image.group(1) if image else \"\",\n", + " \"target_filename\": target_filename.group(1) if target_filename else \"\",\n", + " \"parent_image\": parent_image.group(1).split(\"\\\\\")[-1] if parent_image else \"\",\n", + " \"text\": text\n", + " }\n", + "\n", + "# Apply extraction to the Polars DataFrame using map_elements\n", + "df_f = df_f.with_columns(\n", + " pl.col(\"filtered_message\").map_elements(lambda x: extract_info(x), return_dtype=pl.Object).alias(\"extracted_info\")\n", + ")\n", + "\n", + "# Extract fields from the extracted_info column using map_elements with return_dtype\n", + "df_f = df_f.with_columns(\n", + " pl.col(\"extracted_info\").map_elements(lambda x: x['image'], return_dtype=pl.Utf8).alias(\"image\"),\n", + " pl.col(\"extracted_info\").map_elements(lambda x: x['target_filename'], return_dtype=pl.Utf8).alias(\"target_filename\"),\n", + " pl.col(\"extracted_info\").map_elements(lambda x: 
x['parent_image'], return_dtype=pl.Utf8).alias(\"parent_image\"),\n", + " pl.col(\"extracted_info\").map_elements(lambda x: x['text'], return_dtype=pl.Utf8).alias(\"text\")\n", + ").drop(\"extracted_info\")\n", + "\n", + "print(df_f.head())\n", + "\n", + "# Print the unique values in the parent_image column\n", + "print(df_f[\"parent_image\"].value_counts())\n", + "print(df_f[\"target_filename\"].value_counts())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WBy3Rqj_orz_", + "outputId": "aa6e5749-f0ac-4505-a9ea-f7a1488362f5" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape: (5, 12)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ @timestam ┆ host.host ┆ host.ip ┆ log.level ┆ … ┆ image ┆ target_fi ┆ parent_im ┆ text │\n", + "│ p ┆ name ┆ --- ┆ --- ┆ ┆ --- ┆ lename ┆ age ┆ --- │\n", + "│ --- ┆ --- ┆ str ┆ str ┆ ┆ str ┆ --- ┆ --- ┆ str │\n", + "│ str ┆ str ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ 2024-07-2 ┆ win10 ┆ fe80::c1a ┆ informati ┆ … ┆ C:\\Window ┆ ┆ ┆ Network │\n", + "│ 8T15:08:2 ┆ ┆ f:35de:60 ┆ on ┆ ┆ s\\System3 ┆ ┆ ┆ connecti │\n", + "│ 4.277Z ┆ ┆ 06:d4cf ┆ ┆ ┆ 2\\svchost ┆ ┆ ┆ on detec │\n", + "│ ┆ ┆ ┆ ┆ ┆ .exe ┆ ┆ ┆ ted: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Rul… │\n", + "│ 2024-07-2 ┆ win10 ┆ fe80::c1a ┆ informati ┆ … ┆ C:\\Window ┆ ┆ ┆ Network │\n", + "│ 8T15:08:2 ┆ ┆ f:35de:60 ┆ on ┆ ┆ s\\System3 ┆ ┆ ┆ connecti │\n", + "│ 4.488Z ┆ ┆ 06:d4cf ┆ ┆ ┆ 2\\svchost ┆ ┆ ┆ on detec │\n", + "│ ┆ ┆ ┆ ┆ ┆ .exe ┆ ┆ ┆ ted: │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Rul… │\n", + "│ 2024-07-2 ┆ win10 ┆ fe80::c1a ┆ informati ┆ … ┆ C:\\Window ┆ ┆ ┆ Process │\n", + "│ 8T15:08:2 ┆ ┆ f:35de:60 ┆ on ┆ ┆ s\\system3 ┆ ┆ ┆ accessed │\n", + "│ 5.005Z ┆ ┆ 06:d4cf ┆ ┆ ┆ 2\\svchost ┆ ┆ ┆ : │\n", + "│ ┆ ┆ ┆ ┆ ┆ .exe ┆ ┆ ┆ RuleName 
│\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ : - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Ut… │\n", + "│ 2024-07-2 ┆ win10 ┆ fe80::c1a ┆ informati ┆ … ┆ C:\\Window ┆ ┆ ┆ Process │\n", + "│ 8T15:08:2 ┆ ┆ f:35de:60 ┆ on ┆ ┆ s\\system3 ┆ ┆ ┆ accessed │\n", + "│ 5.005Z ┆ ┆ 06:d4cf ┆ ┆ ┆ 2\\svchost ┆ ┆ ┆ : │\n", + "│ ┆ ┆ ┆ ┆ ┆ .exe ┆ ┆ ┆ RuleName │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ : - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Ut… │\n", + "│ 2024-07-2 ┆ win10 ┆ fe80::c1a ┆ informati ┆ … ┆ C:\\Window ┆ ┆ ┆ Process │\n", + "│ 8T15:08:2 ┆ ┆ f:35de:60 ┆ on ┆ ┆ s\\system3 ┆ ┆ ┆ accessed │\n", + "│ 5.030Z ┆ ┆ 06:d4cf ┆ ┆ ┆ 2\\svchost ┆ ┆ ┆ : │\n", + "│ ┆ ┆ ┆ ┆ ┆ .exe ┆ ┆ ┆ RuleName │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ : - │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Ut… │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n", + "shape: (41, 2)\n", + "┌───────────────────────────────────┬───────┐\n", + "│ parent_image ┆ count │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═══════════════════════════════════╪═══════╡\n", + "│ msiexec.exe ┆ 2 │\n", + "│ CompatTelRunner.exe ┆ 2 │\n", + "│ msedge.exe ┆ 127 │\n", + "│ AvEmUpdate.exe ┆ 6 │\n", + "│ … ┆ … │\n", + "│ MicrosoftEdge_X64_127.0.2651.74_… ┆ 1 │\n", + "│ ┆ 11637 │\n", + "│ runonce.exe ┆ 1 │\n", + "│ PLUGScheduler.exe ┆ 2 │\n", + "└───────────────────────────────────┴───────┘\n", + "shape: (91, 2)\n", + "┌───────────────────────────────────┬───────┐\n", + "│ target_filename ┆ count │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═══════════════════════════════════╪═══════╡\n", + "│ C:\\Users\\student\\AppData\\Local\\M… ┆ 2 │\n", + "│ C:\\ProgramData\\Microsoft\\ClickTo… ┆ 1 │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", + "│ C:\\Program Files\\WindowsApps\\Mic… ┆ 1 │\n", + "│ … ┆ … │\n", + "│ C:\\Program Files (x86)\\Microsoft… ┆ 1 │\n", + "│ C:\\Users\\student\\AppData\\Local\\M… ┆ 1 │\n", + "│ C:\\Program Files\\WindowsApps\\Mic… ┆ 1 │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", + 
"└───────────────────────────────────┴───────┘\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def is_temp_folder(path):\n", + " if not path:\n", + " return False\n", + "\n", + " # Convert path to lowercase for case-insensitive comparison\n", + " lower_path = path.lower()\n", + "\n", + " # Check if the path contains \"\\temp\\\" or ends with \"\\temp\"\n", + " if \"\temp\\\\\" in lower_path or lower_path.endswith(\"\\\\temp\"):\n", + " return True\n", + "\n", + " # Check for specific temp folder patterns\n", + " temp_patterns = [\n", + " r\"c:\windows\temp\",\n", + " r\"c:\users\*\appdata\local\temp\",\n", + " r\"c:\users\*\appdata\locallow\temp\",\n", + " r\"c:\users\*\appdata\roaming\temp\",\n", + " r\"c:\temp\",\n", + " r\"c:\windows\softwaredistribution\download\",\n", + " ]\n", + "\n", + " for pattern in temp_patterns:\n", + " if pattern.startswith(r\"c:\users\*\"):\n", + " # Replace the wildcard with the actual username\n", + " user_pattern = pattern.replace(\"*\", path.split(\"\\\\\")[2])\n", + " if lower_path.startswith(user_pattern):\n", + " return True\n", + " elif lower_path.startswith(pattern):\n", + " return True\n", + "\n", + " return False\n", + "\n", + "def get_filename(path):\n", + " return path.split(\"\\\\\")[-1] if path else \"\"\n", + "\n", + "# Add new columns to the DataFrame in a single operation\n", + "df_f = df_f.with_columns([\n", + " pl.col(\"target_filename\").map_elements(lambda x: \"Yes\" if is_temp_folder(x) else \"No\").alias(\"temp_folder\"),\n", + " pl.col(\"target_filename\").map_elements(get_filename).alias(\"filename\")\n", + "])\n", + "\n", + "# Print the first few rows where temp_folder is \"Yes\"\n", + "print(df_f.filter(pl.col(\"temp_folder\") == \"Yes\").select([\"target_filename\", \"temp_folder\", \"filename\"]).head(10))\n", + "\n", + "# Print value counts for temp_folder column\n", + "print(df_f[\"temp_folder\"].value_counts())" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "b0wLxKfzsg1e", + "outputId": "10bc2daa-ad91-4e8e-e2d8-2b941e670935" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape: (10, 3)\n", + "┌───────────────────────────────────┬─────────────┬────────────────────────┐\n", + "│ target_filename ┆ temp_folder ┆ filename │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str │\n", + "╞═══════════════════════════════════╪═════════════╪════════════════════════╡\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ file.exe │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ cli-32.exe │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ cli-64.exe │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ cli-arm64.exe │\n", + "│ … ┆ … ┆ … │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ gui-64.exe │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ gui-arm64.exe │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ gui.exe │\n", + "│ C:\\Users\\student\\AppData\\Local\\T… ┆ Yes ┆ wininst-10.0-amd64.exe │\n", + "└───────────────────────────────────┴─────────────┴────────────────────────┘\n", + "shape: (2, 2)\n", + "┌─────────────┬───────┐\n", + "│ temp_folder ┆ count │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═════════════╪═══════╡\n", + "│ No ┆ 13289 │\n", + "│ Yes ┆ 166 │\n", + "└─────────────┴───────┘\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "count = df_f.filter((pl.col(\"temp_folder\") == \"Yes\") & pl.col(\"filename\").str.contains(\"file.exe\")).height\n", + "print(count)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uj7NV2wGPHi2", + "outputId": "44158419-7cda-4720-d26c-104caec20093" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "114\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Store the data as CSV and JSON" + ], + 
"metadata": { + "id": "BGPg1RDmxMVE" + } + }, + { + "cell_type": "code", + "source": [ + "df_f.write_csv(\"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered.csv\", include_header=True)" + ], + "metadata": { + "id": "ngGJUIsXu5fL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "\n", + "# Convert DataFrame to JSON, line by line\n", + "json_lines = [json.dumps(record) for record in df.to_dicts()]\n", + "\n", + "# Append each line to an existing JSON file\n", + "with open(\"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered.json\", 'a') as file:\n", + " for line in json_lines:\n", + " file.write(line + '\\n') # Append each line and add a newline" + ], + "metadata": { + "id": "lRQS5xLTvpZv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Labeling based on Python" + ], + "metadata": { + "id": "GgFNrU80xQsF" + } + }, + { + "cell_type": "code", + "source": [ + "def define_label(row):\n", + " conditions = {\n", + " (\"EXCEL.EXE\" in row['image'] and \".exe\" in row['target_filename'] and row['temp_folder'] == \"Yes\"): \"bad\",\n", + " (row['parent_image'] == \"EXCEL.EXE\" and row['temp_folder'] == \"Yes\" and row['image'].lower().endswith('.exe')): \"bad\",\n", + " # Add more conditions here if needed\n", + " }\n", + " return conditions.get(True, \"good\")\n", + "\n", + "# Apply the label to the DataFrame\n", + "df_f = df_f.with_columns(pl.struct(df_f.columns).map_elements(define_label).alias(\"label\"))\n", + "\n", + "# Print the first few rows where the label is \"bad\"\n", + "print(df_f.filter(pl.col(\"label\") == \"bad\").select([\"image\", \"parent_image\", \"target_filename\", \"temp_folder\", \"label\"]).head(10))\n", + "\n", + "# Print value counts for the label column\n", + "print(df_f[\"label\"].value_counts())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"X5ptzPnfxP9M", + "outputId": "4ac38968-70f5-4b1a-f956-65118365efdd" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape: (10, 5)\n", + "┌──────────────────────────────┬──────────────┬──────────────────────────────┬─────────────┬───────┐\n", + "│ image ┆ parent_image ┆ target_filename ┆ temp_folder ┆ label │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str ┆ str │\n", + "╞══════════════════════════════╪══════════════╪══════════════════════════════╪═════════════╪═══════╡\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "│ C:\\Program Files\\Microsoft ┆ ┆ C:\\Users\\student\\AppData\\Loc ┆ Yes ┆ bad │\n", + "│ Offic… ┆ ┆ al\\T… ┆ ┆ │\n", + "└──────────────────────────────┴──────────────┴──────────────────────────────┴─────────────┴───────┘\n", + "shape: (2, 2)\n", + "┌───────┬───────┐\n", + "│ label ┆ count │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═══════╪═══════╡\n", + "│ good ┆ 13341 │\n", + "│ bad ┆ 114 │\n", + "└───────┴───────┘\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Filter the DataFrame for rows where label is 
\"bad\" and select specified columns\n", + "bad_df = df_f.filter(pl.col(\"label\") == \"bad\").select([\n", + " \"image\", \"parent_image\", \"filename\", \"temp_folder\", \"label\"\n", + "])\n", + "\n", + "# Write the filtered DataFrame to a CSV file\n", + "bad_df.write_csv(\"bad.csv\")\n", + "\n", + "# Print the first 10 rows of the bad DataFrame\n", + "print(bad_df.head(10))\n", + "\n", + "# Print the total count of \"bad\" rows\n", + "total_bad_count = bad_df.shape[0]\n", + "print(f\"\\nTotal number of 'bad' rows: {total_bad_count}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dRH3AReO_i5D", + "outputId": "f3bfd34d-eb82-4d8e-f73d-d0305d003da6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape: (10, 5)\n", + "┌───────────────────────────────────┬──────────────┬──────────┬─────────────┬───────┐\n", + "│ image ┆ parent_image ┆ filename ┆ temp_folder ┆ label │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str ┆ str │\n", + "╞═══════════════════════════════════╪══════════════╪══════════╪═════════════╪═══════╡\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "│ C:\\Program Files\\Microsoft Offic… ┆ ┆ file.exe ┆ Yes ┆ bad │\n", + "└───────────────────────────────────┴──────────────┴──────────┴─────────────┴───────┘\n", + "\n", + "Total number of 'bad' rows: 114\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Vectorization with 
Linformer" + ], + "metadata": { + "id": "9-1dkT2mQHvJ" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "torch.cuda.empty_cache()" + ], + "metadata": { + "id": "_Vgi0rjn_RIn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Linformer parameter 1:1 from the researchers\n", + "# then the input_size is determined based on the max no. of tokens\n", + "# the positional embedding flag is on by default\n", + "\n", + "from linformer_pytorch import LinformerLM\n", + "import torch\n", + "\n", + "linformer_model = LinformerLM(\n", + " num_tokens=30000, # Number of tokens in the LM\n", + " input_size=700, # Dimension 1 of the input\n", + " channels=64, # Dimension 2 of the input\n", + " dim_d=None, # Overwrites the inner dim of the attention heads. If None, sticks with the recommended channels // nhead, as in the \"Attention is all you need\" paper\n", + " dim_k=128, # The second dimension of the P_bar matrix from the paper\n", + " dim_ff=128, # Dimension in the feed forward network\n", + " dropout_ff=0.15, # Dropout for feed forward network\n", + " nhead=4, # Number of attention heads\n", + " depth=2, # How many times to run the model\n", + " dropout=0.1, # How much dropout to apply to P_bar after softmax\n", + " activation=\"gelu\", # What activation to use. Currently, only gelu and relu supported, and only on ff network.\n", + " checkpoint_level=\"C0\", # What checkpoint level to use. For more information, see below.\n", + " parameter_sharing=\"layerwise\", # What level of parameter sharing to use. For more information, see below.\n", + " k_reduce_by_layer=0, # Going down `depth`, how much to reduce `dim_k` by, for the `E` and `F` matrices. Will have a minimum value of 1.\n", + " full_attention=False, # Use full attention instead, for O(n^2) time and space complexity. 
Included here just for comparison\n", + " include_ff=True, # Whether or not to include the Feed Forward layer\n", + " w_o_intermediate_dim=None, # If not None, have 2 w_o matrices, such that instead of `dim*nhead,channels`, you have `dim*nhead,w_o_int`, and `w_o_int,channels`\n", + " emb_dim=128, # If you want the embedding dimension to be different than the channels for the Linformer\n", + " causal=False, # If you want this to be a causal Linformer, where the upper right of the P_bar matrix is masked out.\n", + " method=\"learnable\", # The method of how to perform the projection. Supported methods are 'convolution', 'learnable', and 'no_params'\n", + " ff_intermediate=None, # See the section below for more information\n", + " ).cuda()" + ], + "metadata": { + "id": "SsW6vlQNQhAy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Vectorize text column in the DataFrame" + ], + "metadata": { + "id": "h4cuBLWrSNVs" + } + }, + { + "cell_type": "code", + "source": [ + "print(df_f.columns)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4SguqPPKSdpJ", + "outputId": "9e9a39cb-10b8-44fb-bc22-65deafe45989" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['@timestamp', 'host.hostname', 'host.ip', 'log.level', 'winlog.event_id', 'winlog.task', 'message', 'filtered_message', 'image', 'target_filename', 'parent_image', 'text', 'temp_folder', 'filename', 'label']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from tokenizers import Tokenizer\n", + "import torch\n", + "import numpy as np\n", + "import polars as pl\n", + "\n", + "# Load the custom tokenizer\n", + "tokenizer = Tokenizer.from_file(\"log_tokenizer.json\")\n", + "\n", + "# Define the device (assuming you're using PyTorch and want to specify CPU or GPU)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + 
"def vectorize_text(text):\n", + " MAX_LENGTH = 700 # Define the maximum length of tokens for the model\n", + "\n", + " # Tokenize using the custom tokenizer\n", + " encoded = tokenizer.encode(text)\n", + "\n", + " # Get token IDs\n", + " input_ids = encoded.ids\n", + "\n", + " # Ensure the input_ids length is exactly MAX_LENGTH\n", + " input_ids = input_ids[:MAX_LENGTH] if len(input_ids) > MAX_LENGTH else input_ids + [0] * (MAX_LENGTH - len(input_ids))\n", + "\n", + " # Convert to PyTorch tensor and move to the appropriate device\n", + " input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)\n", + "\n", + " # Get the model outputs, ensuring the input tensor is the correct size\n", + " outputs = linformer_model(input_ids) # Now passing the tensor directly\n", + "\n", + " # Assuming outputs is the tensor of interest\n", + " vector = outputs.mean(dim=1).detach() # Average over dim 1 and detach from the autograd graph\n", + " return vector.cpu().numpy() # Move tensor back to CPU and convert to numpy\n", + "\n", + "# Assuming `df_f` is a Polars DataFrame with a column \"filtered_message\"\n", + "df_f = df_f.with_columns(\n", + " pl.col(\"filtered_message\").map_elements(lambda x: vectorize_text(x).flatten(), return_dtype=pl.Object).alias(\"message_vector\")\n", + ")\n", + "\n", + "print(df_f)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EyvdFj83SQKI", + "outputId": "2482d1f0-3f47-41e4-df13-5f73325e5e80" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape: (13_455, 16)\n", + "┌────────────┬────────────┬────────────┬────────────┬───┬───────────┬──────────┬───────┬───────────┐\n", + "│ @timestamp ┆ host.hostn ┆ host.ip ┆ log.level ┆ … ┆ temp_fold ┆ filename ┆ label ┆ message_v │\n", + "│ --- ┆ ame ┆ --- ┆ --- ┆ ┆ er ┆ --- ┆ --- ┆ ector │\n", + "│ str ┆ --- ┆ str ┆ str ┆ ┆ --- ┆ str ┆ str ┆ --- │\n", + "│ ┆ str ┆ ┆ ┆ ┆ str ┆ ┆ ┆ object │\n", + 
"╞════════════╪════════════╪════════════╪════════════╪═══╪═══════════╪══════════╪═══════╪═══════════╡\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.45116 │\n", + "│ T15:08:24. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 934 0.01 │\n", + "│ 277Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 940297 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.4095… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.45242 │\n", + "│ T15:08:24. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 962 0.02 │\n", + "│ 488Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 170923 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.3832… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.37145 │\n", + "│ T15:08:25. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 707 0.04 │\n", + "│ 005Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 775189 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.2652… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.34181 │\n", + "│ T15:08:25. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 97 0.04 │\n", + "│ 005Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 779522 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.2722… │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.35288 │\n", + "│ T23:35:53. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 972 0.03 │\n", + "│ 054Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 555745 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.2832… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.37340 │\n", + "│ T23:35:54. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 525 0.03 │\n", + "│ 133Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 428246 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.2805… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.36035 │\n", + "│ T23:35:54. ┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 54 0.03 │\n", + "│ 133Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 481918 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.2802… │\n", + "│ 2024-07-28 ┆ win10 ┆ fe80::c1af ┆ informatio ┆ … ┆ No ┆ ┆ good ┆ [-0.41050 │\n", + "│ T23:41:55. 
┆ ┆ :35de:6006 ┆ n ┆ ┆ ┆ ┆ ┆ 297 0.01 │\n", + "│ 301Z ┆ ┆ :d4cf ┆ ┆ ┆ ┆ ┆ ┆ 828452 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -0.3112… │\n", + "└────────────┴────────────┴────────────┴────────────┴───┴───────────┴──────────┴───────┴───────────┘\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import polars as pl\n", + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "import numpy as np\n", + "\n", + "# Print the column names and data types\n", + "print(\"Column names and data types:\")\n", + "for col in df_f.columns:\n", + " print(f\"{col}: {df_f[col].dtype}\")\n", + "\n", + "# Create PyArrow arrays for each column\n", + "pa_arrays = []\n", + "pa_field_names = []\n", + "\n", + "for col_name in df_f.columns:\n", + " col_data = df_f[col_name].to_list()\n", + "\n", + " if df_f[col_name].dtype == pl.Object:\n", + " # For Object dtype, we'll create a list of float64 arrays\n", + " try:\n", + " pa_array = pa.list_(pa.float64()).from_pandas(col_data)\n", + " except:\n", + " # If conversion fails, store as string\n", + " pa_array = pa.array([str(x) for x in col_data])\n", + " else:\n", + " pa_array = pa.array(col_data)\n", + "\n", + " pa_arrays.append(pa_array)\n", + " pa_field_names.append(col_name)\n", + "\n", + "# Create PyArrow table\n", + "pa_table = pa.Table.from_arrays(pa_arrays, names=pa_field_names)\n", + "\n", + "# Write the PyArrow table to Parquet\n", + "pq.write_table(pa_table, \"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered_vectors.parquet\")\n", + "\n", + "print(\"Parquet file written successfully.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RWRNV5iCTPHB", + "outputId": "ce896f59-95ef-4794-efb1-a50357ff6292" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Column names and data types:\n", + "@timestamp: Datetime(time_unit='us', time_zone='UTC')\n", + "host.hostname: Utf8\n", + "host.ip: Utf8\n", + "log.level: 
Utf8\n", + "winlog.event_id: Int64\n", + "winlog.task: Utf8\n", + "message: Utf8\n", + "filtered_message: Utf8\n", + "image: Utf8\n", + "target_filename: Utf8\n", + "parent_image: Utf8\n", + "text: Utf8\n", + "temp_folder: Utf8\n", + "filename: Utf8\n", + "label: Utf8\n", + "message_vector_list: Object\n", + "Parquet file written successfully.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pyarrow.parquet as pq\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "\n", + "# Read the Parquet file\n", + "table = pq.read_table(\"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered_vectors.parquet\")\n", + "print(\"Parquet file read successfully\")\n", + "\n", + "# Convert to pandas DataFrame\n", + "df = table.to_pandas()\n", + "print(\"Converted to pandas DataFrame successfully\")\n", + "\n", + "# Function to convert string representation of array to numpy array\n", + "def string_to_array(s):\n", + " # Remove square brackets\n", + " s = s.strip('[]')\n", + " # Split by whitespace, handling the '...' case\n", + " nums = re.split(r'\\s+', s)\n", + " # Convert to float, ignoring '...' 
and empty strings\n", + " return np.array([float(num) for num in nums if num not in ['...', '']])\n", + "\n", + "# Convert message_vector_list to numpy arrays\n", + "try:\n", + " df['message_vector'] = df['message_vector_list'].apply(string_to_array)\n", + " print(\"Converted message_vector_list to numpy arrays successfully\")\n", + "except Exception as e:\n", + " print(f\"Error converting message_vector_list to numpy arrays: {e}\")\n", + " # Print a few examples of the problematic data\n", + " print(df['message_vector_list'].head())\n", + " raise\n", + "\n", + "# Drop the original message_vector_list column if you don't need it\n", + "df = df.drop(columns=['message_vector_list'])\n", + "\n", + "print(\"Final DataFrame columns:\")\n", + "print(df.columns)\n", + "\n", + "# Print the shape of the first few message vectors to verify\n", + "print(\"Shape of first few message vectors:\")\n", + "print(df['message_vector'].head().apply(lambda x: x.shape))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6eQyNWIAW-ts", + "outputId": "7aa26ca7-cbe6-4ee2-982f-e219a01dda18" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Parquet file read successfully\n", + "Converted to pandas DataFrame successfully\n", + "Converted message_vector_list to numpy arrays successfully\n", + "Final DataFrame columns:\n", + "Index(['@timestamp', 'host.hostname', 'host.ip', 'log.level',\n", + " 'winlog.event_id', 'winlog.task', 'message', 'filtered_message',\n", + " 'image', 'target_filename', 'parent_image', 'text', 'temp_folder',\n", + " 'filename', 'label', 'message_vector'],\n", + " dtype='object')\n", + "Shape of first few message vectors:\n", + "0 (6,)\n", + "1 (6,)\n", + "2 (6,)\n", + "3 (6,)\n", + "4 (6,)\n", + "Name: message_vector, dtype: object\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# AutoML with TPOT (Supervised Learning)" + ], + "metadata": { + "id": 
"ZqGIRBU6aE3U" + } + }, + { + "cell_type": "code", + "source": [ + "print(\"Polars Df\")\n", + "print(df_f.head())\n", + "print(df_f.schema)\n", + "print()\n", + "print(\"Pandas Df\")\n", + "print(df.info())\n", + "print(df.dtypes)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 236 + }, + "id": "9JjYJzacaD-T", + "outputId": "3f5a9112-2530-4f39-bd8b-8e11d0edb51d" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Polars Df\n" + ] + }, + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'df_f' is not defined", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Polars Df\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pandas Df\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_f' is not defined" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + 
"from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from tpot import TPOTClassifier\n", + "from sklearn.metrics import f1_score\n", + "import numpy as np\n", + "import pandas as pd\n", + "from collections import defaultdict\n", + "import time\n", + "import re\n", + "from tabulate import tabulate\n", + "\n", + "\n", + "# Assuming df is already loaded and contains 'message_vector' and 'label' columns\n", + "\n", + "# Encode labels\n", + "le = LabelEncoder()\n", + "df['label_encoded'] = le.fit_transform(df['label'])\n", + "\n", + "# Split data\n", + "X = np.array(df['message_vector'].tolist())\n", + "y = df['label_encoded'].values\n", + "\n", + "# Initialize results storage\n", + "results = defaultdict(list)\n", + "\n", + "# Number of runs\n", + "n_runs = 1\n", + "\n", + "# Function to extract number of features selected\n", + "def get_n_features(pipeline_str):\n", + " match = re.search(r'SelectPercentile\\(score_func=f_classif, percentile=(\\d+)\\)', pipeline_str)\n", + " if match:\n", + " percentile = int(match.group(1))\n", + " return int(X.shape[1] * percentile / 100)\n", + " return X.shape[1] # If no feature selection, return all features\n", + "\n", + "# Initialize best_tpot and best_f1\n", + "best_tpot = None\n", + "best_f1 = 0\n", + "\n", + "for run in range(n_runs):\n", + " print(f\"\\nStarting run {run + 1}/{n_runs}\")\n", + " start_time = time.time()\n", + "\n", + " # Split data for this run\n", + " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 + run)\n", + "\n", + " # TPOT classifier with f1 score as the metric\n", + " tpot = TPOTClassifier(\n", + " scoring='f1_weighted', # Use weighted F1 score for multi-class problems\n", + " verbosity=2,\n", + " generations=5,\n", + " population_size=20,\n", + " random_state=42 + run\n", + " )\n", + "\n", + " # Fit\n", + " tpot.fit(X_train, y_train)\n", + "\n", + " # Predict and calculate F1 score\n", + " y_pred = 
tpot.predict(X_test)\n", + " f1 = f1_score(y_test, y_pred, average='weighted')\n", + "\n", + " # Update best_tpot if this run has better f1 score\n", + " if f1 > best_f1:\n", + " best_f1 = f1\n", + " best_tpot = tpot\n", + "\n", + " # Get pipeline string and extract number of features\n", + " pipeline_str = str(tpot.fitted_pipeline_)\n", + " n_features = get_n_features(pipeline_str)\n", + "\n", + " # Store results\n", + " results['run'].append(run + 1)\n", + " results['f1_score'].append(f1)\n", + " results['best_pipeline'].append(pipeline_str)\n", + " results['n_features'].append(n_features)\n", + " results['runtime'].append(time.time() - start_time)\n", + " results['pipelines_tested'].append(tpot.evaluated_individuals_)\n", + "\n", + " print(f\"Run {run + 1} completed. F1 Score: {f1:.4f}, Features selected: {n_features}, Pipelines tested: {len(tpot.evaluated_individuals_)}\")\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 289, + "referenced_widgets": [ + "bcf5338c159d4d53bc3e39e717885292", + "7276e9b8b79d41ab991296d44521e2ee", + "f2ab86c6413649a595edf4b8feeb6a66", + "decc923894ab4ef4b35fa627b1b2dfb4", + "9875e07950114beca6741a51962c1f30", + "90db1bae92c94a49ab8bc9c12073e552", + "92a42873a8ef4c00ad24ef47a62b4093", + "1826f0b9003a4f8bb1c074f526560c86", + "549831432f244f5080c74eb842bae875", + "20be31d5be524d76bc2dbb3baad4edf3", + "fc71d71e57774cb1a0df69e82d3e7c2d" + ] + }, + "id": "usxaQgkSbEND", + "outputId": "1f3cbeaa-d197-4877-b4d1-3f0f7d035f27" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Starting run 1/1\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Optimization Progress: 0%| | 0/120 [00:00" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "from collections import defaultdict\n", + "import re\n", + "from tabulate import tabulate\n", + "\n", + "# 
Assuming results_df is your DataFrame with all the run results\n", + "\n", + "def simplify_pipeline(pipeline_str):\n", + " # Extract the main classifier name\n", + " classifier = re.search(r\"(\\w+)\\(\", pipeline_str).group(1)\n", + " return classifier.lower()\n", + "\n", + "# Initialize a dictionary to store all pipeline statistics\n", + "all_pipeline_stats = defaultdict(int)\n", + "\n", + "# Iterate through all runs and all evaluated pipelines\n", + "for _, row in results_df.iterrows():\n", + " for pipeline_str in row['pipelines_tested']:\n", + " simple_pipeline = simplify_pipeline(pipeline_str)\n", + " all_pipeline_stats[simple_pipeline] += 1\n", + "\n", + "# Prepare the summary table\n", + "summary_table = [[pipeline, count] for pipeline, count in all_pipeline_stats.items()]\n", + "\n", + "# Sort by count\n", + "summary_table.sort(key=lambda x: -x[1])\n", + "\n", + "# Print the summary table\n", + "print(\"\\nTop 10 Pipelines Across All Runs and Evaluations:\")\n", + "print(tabulate(summary_table[:10], headers=['Pipeline', 'Count'], tablefmt='grid'))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9qalTrUsqpVD", + "outputId": "ab89ae33-96cb-41ad-83ab-2d0f7d21a7fe" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Top 10 Pipelines Across All Runs and Evaluations:\n", + "+------------------------+---------+\n", + "| Pipeline | Count |\n", + "+========================+=========+\n", + "| xgbclassifier | 44 |\n", + "+------------------------+---------+\n", + "| extratreesclassifier | 20 |\n", + "+------------------------+---------+\n", + "| bernoullinb | 13 |\n", + "+------------------------+---------+\n", + "| decisiontreeclassifier | 12 |\n", + "+------------------------+---------+\n", + "| gaussiannb | 7 |\n", + "+------------------------+---------+\n", + "| kneighborsclassifier | 7 |\n", + "+------------------------+---------+\n", + "| 
randomforestclassifier | 6 |\n", + "+------------------------+---------+\n", + "| mlpclassifier | 3 |\n", + "+------------------------+---------+\n", + "| sgdclassifier | 3 |\n", + "+------------------------+---------+\n", + "| multinomialnb | 1 |\n", + "+------------------------+---------+\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# File name to search for\n", + "file_name = \"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered_vectors.parquet\"\n", + "\n", + "# Get the download URL of the specific file\n", + "# download_url = get_specific_file_from_latest_release(github_token, repository_name, file_name)\n", + "download_url = get_specific_file_from_tagged_release(github_token, repository_name, \"lab\", file_name)\n", + "print(download_url)\n", + "\n", + "if download_url:\n", + " local_file_path = file_name\n", + " download_file(download_url, github_token, local_file_path)\n", + "else:\n", + " print(\"File not found.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85, + "referenced_widgets": [ + "8a807c2780cf4a65ad92dfa1ead0e439", + "021bf8d789b74e3d899a99eeb82a0036", + "f7aeed7c0692419591ca57bffee51739", + "d7f2da7e3beb42dfaf31dc69eb7a597e", + "c7b913c695fc448c8fc7b31f7098f636", + "a59325353a1d4902b03f5a37c0293934", + "6cd3d0fc462c4b14bbe6be92b8e6103e", + "0cfdc6a2fa49499e9f0ce350b88dfbc4", + "a9c70dae97424ca290a75e10d24baf52", + "f0fc3fee01b04ffc922096ee18b6cb0f", + "9142d6e9793648a9909e5ce3dabf415b" + ] + }, + "id": "SGH18PwBSdeE", + "outputId": "6bb659aa-bdf4-49fc-ad4a-c9f8968ce3c5" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "https://api.github.com/repos/norandom/log2ml/releases/assets/182698628\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0.00/3.29M [00:00 0]\n", + "\n", + "# Sort by count\n", + "summary_table.sort(key=lambda x: -x[1])\n", + "\n", + "# Print the summary 
table\n", + "print(\"\\nEstimated Algorithms Used Across All Runs:\")\n", + "print(tabulate(summary_table, headers=['Algorithm', 'Estimated Count'], tablefmt='grid'))\n", + "\n", + "# Calculate and print additional statistics\n", + "total_runs = len(results_df)\n", + "successful_runs = results_df['f1_score'].notna().sum()\n", + "average_f1 = results_df['f1_score'].mean()\n", + "average_features = results_df['n_features'].mean() if 'n_features' in results_df.columns else None\n", + "average_runtime = results_df['runtime'].mean()\n", + "\n", + "print(f\"\\nTotal Runs: {total_runs}\")\n", + "print(f\"Successful Runs: {successful_runs}\")\n", + "print(f\"Average F1 Score: {average_f1:.4f}\")\n", + "if average_features is not None:\n", + " print(f\"Average Number of Features: {average_features:.2f}\")\n", + "print(f\"Average Runtime: {average_runtime:.2f} seconds\")\n", + "print(f\"Total Pipelines Tested: {total_pipelines}\")\n", + "\n", + "# Print full best pipelines for each run\n", + "print(\"\\nBest Pipelines for Each Run:\")\n", + "for index, row in results_df.iterrows():\n", + " print(f\"\\nRun {index + 1}:\")\n", + " print(f\"F1 Score: {row['f1_score']:.4f}\")\n", + " print(f\"Pipeline: {row['best_pipeline']}\")\n", + " print(f\"Pipelines tested in this run: {row['pipelines_tested']}\")\n", + "\n", + "# Optional: Add visualization\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Visualize F1 scores across runs\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(results_df['run'], results_df['f1_score'], marker='o')\n", + "plt.title('F1 Score Across Runs')\n", + "plt.xlabel('Run')\n", + "plt.ylabel('F1 Score')\n", + "plt.grid(True)\n", + "plt.savefig('f1_scores_across_runs.png')\n", + "plt.close()\n", + "\n", + "# Visualize pipeline counts\n", + "algorithms, counts = zip(*summary_table)\n", + "plt.figure(figsize=(12, 6))\n", + "plt.bar(algorithms, counts)\n", + "plt.title('Estimated Algorithm Usage')\n", + "plt.xlabel('Algorithm')\n", + 
"plt.ylabel('Estimated Count')\n", + "plt.xticks(rotation=45, ha='right')\n", + "plt.tight_layout()\n", + "plt.savefig('estimated_algorithm_usage.png')\n", + "plt.close()\n", + "\n", + "print(\"\\nVisualization plots saved as 'f1_scores_across_runs.png' and 'estimated_algorithm_usage.png'\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5m0mtcVZduol", + "outputId": "1d8b8679-4cbc-4574-dc35-6e7ab8b3d1cd" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Estimated Algorithms Used Across All Runs:\n", + "+----------------------+-------------------+\n", + "| Algorithm | Estimated Count |\n", + "+======================+===================+\n", + "| MLPClassifier | 45 |\n", + "+----------------------+-------------------+\n", + "| KNeighborsClassifier | 45 |\n", + "+----------------------+-------------------+\n", + "| XGBClassifier | 45 |\n", + "+----------------------+-------------------+\n", + "| LinearSVC | 45 |\n", + "+----------------------+-------------------+\n", + "| SGDClassifier | 45 |\n", + "+----------------------+-------------------+\n", + "| Other | 6 |\n", + "+----------------------+-------------------+\n", + "\n", + "Total Runs: 2\n", + "Successful Runs: 2\n", + "Average F1 Score: 0.9872\n", + "Average Number of Features: 5.00\n", + "Average Runtime: 109.13 seconds\n", + "Total Pipelines Tested: 229\n", + "\n", + "Best Pipelines for Each Run:\n", + "\n", + "Run 1:\n", + "F1 Score: 0.9872\n", + "Pipeline: Pipeline(steps=[('gaussiannb', GaussianNB())])\n", + "Pipelines tested in this run: 115\n", + "\n", + "Run 2:\n", + "F1 Score: 0.9872\n", + "Pipeline: Pipeline(steps=[('extratreesclassifier',\n", + " ExtraTreesClassifier(criterion='entropy',\n", + " max_features=0.8500000000000001,\n", + " min_samples_leaf=20, min_samples_split=19,\n", + " random_state=43))])\n", + "Pipelines tested in this run: 114\n", + "\n", + "Visualization plots saved as 
'f1_scores_across_runs.png' and 'estimated_algorithm_usage.png'\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "\n", + "# Assuming results_df is your DataFrame with the results\n", + "\n", + "fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 18))\n", + "\n", + "ax1.plot(results_df['run'], results_df['f1_score'], 'bo-')\n", + "ax1.set_title('TPOT-NN Performance (F1 Score) Across Runs')\n", + "ax1.set_xlabel('Run')\n", + "ax1.set_ylabel('F1 Score')\n", + "ax1.grid(True)\n", + "\n", + "ax2.plot(results_df['run'], results_df['n_features'], 'ro-')\n", + "ax2.set_title('Number of Features Selected Across Runs')\n", + "ax2.set_xlabel('Run')\n", + "ax2.set_ylabel('Number of Features')\n", + "ax2.grid(True)\n", + "\n", + "ax3.plot(results_df['run'], results_df['pipelines_tested'], 'go-')\n", + "ax3.set_title('Number of Pipelines Tested Across Runs')\n", + "ax3.set_xlabel('Run')\n", + "ax3.set_ylabel('Number of Pipelines')\n", + "ax3.grid(True)\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('tpot_nn_performance_features_pipelines.png')\n", + "plt.show()\n", + "\n", + "# Print summary statistics\n", + "print(\"\\nSummary Statistics:\")\n", + "print(results_df.describe())\n", + "\n", + "# Optional: If you want to see the correlation between different metrics\n", + "correlation_matrix = results_df[['f1_score', 'n_features', 'pipelines_tested', 'runtime']].corr()\n", + "print(\"\\nCorrelation Matrix:\")\n", + "print(correlation_matrix)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "3lBpziEae4sf", + "outputId": "08837957-1ab6-4acb-ab65-b0f0fca2b56c" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
# Section: "Different Algorithm set"
# TPOT search restricted to the 'TPOT light' configuration, run on
# quantile-transformed, PCA-reduced features.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer, LabelEncoder
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import f1_score
from collections import defaultdict
import time
import re
from tabulate import tabulate


# Encode labels (`X` and `y` are expected to exist from earlier notebook cells)
le = LabelEncoder()
y_encoded = le.fit_transform(y)


def perform_pca(X, transformer, n_components=0.95):
    """Fit ``transformer`` and a PCA on ``X`` and project ``X``.

    Both the transformer and the PCA are fitted on everything passed in, so
    the result must not be used to score a model on a subset of ``X``.
    Use ``perform_pca_train_test`` for evaluation.

    Args:
        X: Feature matrix accepted by ``transformer.fit_transform``.
        transformer: Unfitted scikit-learn style transformer.
        n_components: Forwarded to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(X_pca, n_components_selected)``.
    """
    X_transformed = transformer.fit_transform(X)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_transformed)
    n_components_selected = X_pca.shape[1]
    return X_pca, n_components_selected


def perform_pca_train_test(X_train, X_test, transformer, n_components=0.95):
    """Fit the transformer and PCA on the training split only.

    The held-out rows are only passed through ``transform``, so no
    information from them reaches the fitted preprocessing.

    Args:
        X_train: Training features; used to fit ``transformer`` and the PCA.
        X_test: Held-out features; only transformed.
        transformer: Unfitted scikit-learn style transformer.
        n_components: Forwarded to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(X_train_pca, X_test_pca, n_components_selected)``.
    """
    X_train_transformed = transformer.fit_transform(X_train)
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_transformed)
    X_test_pca = pca.transform(transformer.transform(X_test))
    return X_train_pca, X_test_pca, X_train_pca.shape[1]


# Whole-dataset PCA, kept only for the descriptive print below and so that
# `transformer`, `X_pca` and `n_components` remain defined for other cells.
# It is NOT used for model evaluation any more (see the loop below).
transformer = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)
X_pca, n_components = perform_pca(X, transformer)

print(f"Number of components selected to explain 95% of variance: {n_components}")

# Initialize results storage
results = defaultdict(list)

# Number of runs
n_runs = 2

# scikit-learn's repr omits parameters left at their defaults, so the step can
# show up as `SelectPercentile()`, `SelectPercentile(percentile=38)` or with an
# explicit `score_func=...`. Capture the whole argument list and parse the
# percentile out of it separately.
_SELECT_PERCENTILE_RE = re.compile(r'SelectPercentile\(([^()]*)\)')
_PERCENTILE_ARG_RE = re.compile(r'\bpercentile\s*=\s*(\d+)')
_DEFAULT_PERCENTILE = 10  # scikit-learn's default for SelectPercentile


def get_n_features(pipeline_str, n_total=None):
    """Estimate how many input features the pipeline keeps.

    Args:
        pipeline_str: String form of the fitted pipeline.
        n_total: Number of features fed to the pipeline. Defaults to the
            width of the module-level ``X_pca`` (the original behaviour).

    Returns:
        ``int(n_total * percentile / 100)`` when a ``SelectPercentile`` step
        is present, otherwise ``n_total``.
    """
    if n_total is None:
        n_total = X_pca.shape[1]
    step = _SELECT_PERCENTILE_RE.search(pipeline_str)
    if step is None:
        return n_total  # If no feature selection, return all features
    percentile_arg = _PERCENTILE_ARG_RE.search(step.group(1))
    percentile = int(percentile_arg.group(1)) if percentile_arg else _DEFAULT_PERCENTILE
    return int(n_total * percentile / 100)


for run in range(n_runs):
    print(f"\nStarting run {run + 1}/{n_runs}")
    start_time = time.time()

    # Stratified split on the RAW features; preprocessing is fitted afterwards
    # on the training rows only so the test rows stay unseen.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42 + run, stratify=y_encoded
    )

    # Cap n_quantiles at the training size to avoid scikit-learn's
    # "n_quantiles is greater than the number of samples" warning.
    run_transformer = QuantileTransformer(
        n_quantiles=min(1000, len(X_train_raw)),
        output_distribution='normal',
        random_state=42,
    )
    X_train, X_test, n_run_components = perform_pca_train_test(
        X_train_raw, X_test_raw, run_transformer
    )

    # TPOT classifier with reduced complexity and CPU usage
    tpot = TPOTClassifier(
        config_dict='TPOT light',  # Use a lighter configuration
        scoring='f1_weighted',
        verbosity=2,
        generations=3,
        population_size=10,
        n_jobs=-1,
        random_state=42 + run
    )

    try:
        # Fit
        tpot.fit(X_train, y_train)

        # Predict and calculate F1 score
        y_pred = tpot.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Get pipeline string and extract number of features
        pipeline_str = str(tpot.fitted_pipeline_)
        n_features = get_n_features(pipeline_str, n_run_components)

        # Store results
        results['run'].append(run + 1)
        results['f1_score'].append(f1)
        results['best_pipeline'].append(pipeline_str)
        results['n_features'].append(n_features)
        results['runtime'].append(time.time() - start_time)
        results['pipelines_tested'].append(len(tpot.evaluated_individuals_))

        print(f"Run {run + 1} completed. F1 Score: {f1:.4f}, Features selected: {n_features}, Pipelines tested: {len(tpot.evaluated_individuals_)}")

    except Exception as e:
        # Best effort: record the failed run so every column keeps one entry
        # per run, then continue with the next run.
        print(f"Error in run {run + 1}: {str(e)}")
        results['run'].append(run + 1)
        results['f1_score'].append(None)
        results['best_pipeline'].append(None)
        results['n_features'].append(None)
        results['runtime'].append(time.time() - start_time)
        results['pipelines_tested'].append(None)

# Print results table
print("\nResults Summary:")
print(tabulate(results, headers='keys', tablefmt='grid'))
"Number of components selected to explain 95% of variance: 5\n", + "\n", + "Starting run 1/2\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Optimization Progress: 0%| | 0/40 [00:00 0]\n", + "\n", + "# Sort by count\n", + "summary_table.sort(key=lambda x: -x[1])\n", + "\n", + "# Print the summary table\n", + "print(\"\\nEstimated Algorithms Used Across All Runs:\")\n", + "print(tabulate(summary_table, headers=['Algorithm', 'Estimated Count'], tablefmt='grid'))\n", + "\n", + "# Calculate and print additional statistics\n", + "total_runs = len(results_df)\n", + "successful_runs = results_df['f1_score'].notna().sum()\n", + "average_f1 = results_df['f1_score'].mean()\n", + "average_features = results_df['n_features'].mean()\n", + "average_runtime = results_df['runtime'].mean()\n", + "\n", + "print(f\"\\nTotal Runs: {total_runs}\")\n", + "print(f\"Successful Runs: {successful_runs}\")\n", + "print(f\"Average F1 Score: {average_f1:.4f}\")\n", + "print(f\"Average Number of Features: {average_features:.2f}\")\n", + "print(f\"Average Runtime: {average_runtime:.2f} seconds\")\n", + "print(f\"Total Pipelines Tested: {total_pipelines}\")\n", + "\n", + "# Print full best pipelines for each run\n", + "print(\"\\nBest Pipelines for Each Run:\")\n", + "for index, row in results_df.iterrows():\n", + " print(f\"\\nRun {index + 1}:\")\n", + " print(f\"F1 Score: {row['f1_score']:.4f}\")\n", + " print(f\"Pipeline: {row['best_pipeline']}\")\n", + " print(f\"Pipelines tested in this run: {row['pipelines_tested']}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VgKt6AU28-6f", + "outputId": "ab7671a3-1229-42f5-807a-eec12327733a" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Estimated Algorithms Used Across All Runs:\n", + "+----------------------------+-------------------+\n", + "| Algorithm | Estimated Count |\n", + 
import matplotlib.pyplot as plt
import pandas as pd

# Expects `results_df` (one row per run) to already exist in the notebook.

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 18))

# One entry per stacked panel: (axis, column to plot, line format, title, y label).
panel_specs = [
    (ax1, 'f1_score', 'bo-', 'TPOT-NN Performance (F1 Score) Across Runs', 'F1 Score'),
    (ax2, 'n_features', 'ro-', 'Number of Features Selected Across Runs', 'Number of Features'),
    (ax3, 'pipelines_tested', 'go-', 'Number of Pipelines Tested Across Runs', 'Number of Pipelines'),
]

for panel, column, line_format, title, y_label in panel_specs:
    panel.plot(results_df['run'], results_df[column], line_format)
    panel.set_title(title)
    panel.set_xlabel('Run')
    panel.set_ylabel(y_label)
    panel.grid(True)

plt.tight_layout()
plt.savefig('tpot_nn_performance_features_pipelines.png')
plt.show()

# Descriptive statistics over the columns of results_df
print("\nSummary Statistics:")
print(results_df.describe())

# Pairwise correlation between the per-run metrics
metric_columns = ['f1_score', 'n_features', 'pipelines_tested', 'runtime']
correlation_matrix = results_df[metric_columns].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Summary Statistics:\n", + " run f1_score n_features runtime pipelines_tested\n", + "count 2.000000 2.000000 2.0 2.000000 2.000000\n", + "mean 1.500000 0.987198 5.0 109.132951 114.500000\n", + "std 0.707107 0.000000 0.0 1.063263 0.707107\n", + "min 1.000000 0.987198 5.0 108.381110 114.000000\n", + "25% 1.250000 0.987198 5.0 108.757030 114.250000\n", + "50% 1.500000 0.987198 5.0 109.132951 114.500000\n", + "75% 1.750000 0.987198 5.0 109.508871 114.750000\n", + "max 2.000000 0.987198 5.0 109.884791 115.000000\n", + "\n", + "Correlation Matrix:\n", + " f1_score n_features pipelines_tested runtime\n", + "f1_score NaN NaN NaN NaN\n", + "n_features NaN NaN NaN NaN\n", + "pipelines_tested NaN NaN 1.0 1.0\n", + "runtime NaN NaN 1.0 1.0\n" + ] + } + ] + } + ] +} \ No newline at end of file