diff --git a/Protoype_Alpha_BERT_Embedding_Sysmon.ipynb b/Protoype_Alpha_BERT_Embedding_Sysmon.ipynb
new file mode 100644
index 0000000..cb779d5
--- /dev/null
+++ b/Protoype_Alpha_BERT_Embedding_Sysmon.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Prototype: BGE embeddings for Sysmon / Windows event log text\n",
+    "\n",
+    "Embeds a sample of raw log text with `BAAI/bge-large-en-v1.5` and inspects the resulting `[CLS]` vector."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2670aff-b454-404a-97b6-7c6603bf4599",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%pip install -q torch==2.2.1+cpu -f https://download.pytorch.org/whl/torch_stable.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0b722c7-8ed6-4aa9-83b5-c8e9295e49a2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%pip install -q transformers==4.32.1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "imports",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import polars as pl\n",
+    "import torch\n",
+    "from transformers import AutoModel, AutoTokenizer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "load-model-md",
+   "metadata": {},
+   "source": [
+    "## Load tokenizer and model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "load-model",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_NAME = 'BAAI/bge-large-en-v1.5'\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "model = AutoModel.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# Longest input, in tokens, that the model configuration allows\n",
+    "max_length = model.config.max_position_embeddings\n",
+    "print(\"Maximum length / number of tokens the model allows:\", max_length)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "sample-text-md",
+   "metadata": {},
+   "source": [
+    "## Sample log text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sample-text",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Raw string: the Windows paths are full of backslashes, and in a normal\n",
+    "# string literal \"\\124\" is an octal escape that silently turns into \"T\".\n",
+    "text = r\"\"\"\n",
+    "\"File created:\n",
+    "RuleName: DLL\n",
+    "UtcTime: 2024-05-15 16:00:16.896\n",
+    "ProcessGuid: {18e8265a-da8c-6644-5a01-000000002700}\n",
+    "ProcessId: 6036\n",
+    "Image: C:\\Program Files (x86)\\Microsoft\\EdgeUpdate\\Install\\{D1058E28-B2C1-4930-8BC3-EA038942C727}\\EDGEMITMP_304C3.tmp\\setup.exe\n",
+    "TargetFilename: C:\\Program Files (x86)\\Microsoft\\EdgeCore\\124.0.2478.97\\onnxruntime.dll\n",
+    "CreationUtcTime: 2024-05-15 16:00:16.896\"\n",
+    "2024-05-15T16:00:15.887Z,win10,fe80::24b4:3691:44a6:38a1,information,5379,User Account Management,\"Credential Manager credentials were read.\n",
+    "\n",
+    "Subject:\n",
+    " Security ID: S-1-5-18\n",
+    " Account Name: WIN10$\n",
+    " Account Domain: sec699-20\n",
+    " Logon ID: 0x3E7\n",
+    " Read Operation: Enumerate Credentials\n",
+    "\n",
+    "This event occurs when a user performs a read operation on stored credentials in Credential Manager.\"\n",
+    "\"\"\"\n",
+    "\n",
+    "# Count the tokens the sample produces\n",
+    "num_tokens = len(tokenizer.tokenize(text))\n",
+    "print(\"Number of tokens in the text:\", num_tokens)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "embed-md",
+   "metadata": {},
+   "source": [
+    "## Embed the text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c368bba3-4e7f-45a6-9883-637c3ad34515",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Truncate to the model limit. padding=True pads only to the longest sequence\n",
+    "# in the batch, so a single text is not padded.\n",
+    "inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True, padding=True)\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(**inputs)\n",
+    "\n",
+    "hidden_states = outputs.last_hidden_state\n",
+    "\n",
+    "# Embedding taken from the first position (the [CLS] token)\n",
+    "cls_embedding = hidden_states[:, 0, :]\n",
+    "\n",
+    "# Mean pooling over real tokens only: the attention mask zeroes padded\n",
+    "# positions and supplies the divisor, so padding cannot dilute the average.\n",
+    "token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)\n",
+    "mean_embedding = (hidden_states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "inspect-md",
+   "metadata": {},
+   "source": [
+    "## Inspect the `[CLS]` embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe0ac4ab-73dd-43e1-8fe5-795045d30e60",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Wrap the embedding in a Polars DataFrame for tabular display\n",
+    "cls_embedding_df = pl.DataFrame(cls_embedding.numpy())\n",
+    "cls_embedding_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db92aa20-db10-42a7-bfe5-c640f73d9723",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "cls_embedding.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad40cd5b-49db-40f1-ae59-682974031f3a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Render the embedding as a one-row heat map\n",
+    "fig, ax = plt.subplots(figsize=(10, 0.5))\n",
+    "heatmap = ax.imshow(cls_embedding.numpy(), aspect='auto', cmap='viridis')\n",
+    "fig.colorbar(heatmap, ax=ax)\n",
+    "ax.set_title(\"Visualization of CLS Embedding\")\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}