log2ml/Protoype_Alpha_BERT_Embedding_Sysmon.ipynb

254 lines
27 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "e2670aff-b454-404a-97b6-7c6603bf4599",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
"Requirement already satisfied: torch==2.2.1+cpu in /home/marius/anaconda3/lib/python3.11/site-packages (2.2.1+cpu)\n",
"Requirement already satisfied: filelock in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.9.0)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (4.11.0)\n",
"Requirement already satisfied: sympy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (1.11.1)\n",
"Requirement already satisfied: networkx in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1)\n",
"Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1.2)\n",
"Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (2023.4.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch==2.2.1+cpu) (2.1.1)\n",
"Requirement already satisfied: mpmath>=0.19 in /home/marius/anaconda3/lib/python3.11/site-packages (from sympy->torch==2.2.1+cpu) (1.3.0)\n"
]
}
],
"source": [
"!pip install torch==2.2.1+cpu -f https://download.pytorch.org/whl/torch_stable.html"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c0b722c7-8ed6-4aa9-83b5-c8e9295e49a2",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name: transformers\n",
"Version: 4.32.1\n",
"Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow\n",
"Home-page: https://github.com/huggingface/transformers\n",
"Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)\n",
"Author-email: transformers@huggingface.co\n",
"License: Apache 2.0 License\n",
"Location: /home/marius/anaconda3/lib/python3.11/site-packages\n",
"Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm\n",
"Required-by: \n"
]
}
],
"source": [
"!pip install transformers==4.32.1"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c368bba3-4e7f-45a6-9883-637c3ad34515",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maximum length / number of tokens the model allows: 512\n",
"Number of tokens in the text: 301\n"
]
}
],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"import torch\n",
"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')\n",
"model = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')\n",
"\n",
"# Sample text\n",
"text = \"\"\"\n",
"\"File created:\n",
"RuleName: DLL\n",
"UtcTime: 2024-05-15 16:00:16.896\n",
"ProcessGuid: {18e8265a-da8c-6644-5a01-000000002700}\n",
"ProcessId: 6036\n",
"Image: C:\\Program Files (x86)\\Microsoft\\EdgeUpdate\\Install\\{D1058E28-B2C1-4930-8BC3-EA038942C727}\\EDGEMITMP_304C3.tmp\\setup.exe\n",
"TargetFilename: C:\\Program Files (x86)\\Microsoft\\EdgeCore\\124.0.2478.97\\onnxruntime.dll\n",
"CreationUtcTime: 2024-05-15 16:00:16.896\"\n",
"2024-05-15T16:00:15.887Z,win10,fe80::24b4:3691:44a6:38a1,information,5379,User Account Management,\"Credential Manager credentials were read.\n",
"\n",
"Subject:\n",
" Security ID: S-1-5-18\n",
" Account Name: WIN10$\n",
" Account Domain: sec699-20\n",
" Logon ID: 0x3E7\n",
" Read Operation: Enumerate Credentials\n",
"\n",
"This event occurs when a user performs a read operation on stored credentials in Credential Manager.\"\n",
"\"\"\"\n",
"\n",
"# Access the model's configuration\n",
"max_length = model.config.max_position_embeddings\n",
"print(\"Maximum length / number of tokens the model allows:\", max_length)\n",
"\n",
"# Tokenize the text\n",
"tokens = tokenizer.tokenize(text)\n",
"\n",
"# Count the number of tokens\n",
"num_tokens = len(tokens)\n",
"print(\"Number of tokens in the text:\", num_tokens)\n",
"\n",
"# Encode text\n",
"inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')\n",
"\n",
"# Get model output\n",
"with torch.no_grad():\n",
" outputs = model(**inputs)\n",
"\n",
"# Extract embeddings\n",
"cls_embedding = outputs.last_hidden_state[:, 0, :] # Using the [CLS] token\n",
"\n",
"# For mean pooling\n",
"mean_embedding = outputs.last_hidden_state.mean(dim=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "fe0ac4ab-73dd-43e1-8fe5-795045d30e60",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (1, 1_024)\n",
"┌───────────┬───────────┬───────────┬──────────┬───┬───────────┬───────────┬───────────┬───────────┐\n",
"│ column_0 ┆ column_1 ┆ column_2 ┆ column_3 ┆ … ┆ column_10 ┆ column_10 ┆ column_10 ┆ column_10 │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ 20 ┆ 21 ┆ 22 ┆ 23 │\n",
"│ f32 ┆ f32 ┆ f32 ┆ f32 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ ┆ ┆ ┆ ┆ ┆ f32 ┆ f32 ┆ f32 ┆ f32 │\n",
"╞═══════════╪═══════════╪═══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
"│ -0.208444 ┆ -0.327674 ┆ -0.032746 ┆ 0.461757 ┆ … ┆ 0.704677 ┆ -0.347134 ┆ -0.526733 ┆ -0.049645 │\n",
"└───────────┴───────────┴───────────┴──────────┴───┴───────────┴───────────┴───────────┴───────────┘\n"
]
}
],
"source": [
"import polars as pl\n",
"# Convert the tensor to a Polars DataFrame\n",
"df = pl.DataFrame(cls_embedding.numpy())\n",
"\n",
"# Print the DataFrame\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "db92aa20-db10-42a7-bfe5-c640f73d9723",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([1, 1024])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cls_embedding.shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ad40cd5b-49db-40f1-ae59-682974031f3a",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAvMAAABmCAYAAABGMR4OAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA1PklEQVR4nO3dd3wVVd4/8M+Zdu9N4ZJCCi2AD0jozQKikUWQ+qgsPtgQFNgFwaUsK1j2h6KIfdHXgtiAR8G2D0UERNmlWAgiTRAE2SVACIQS0pN775Tv74+5GbikkCAxJHzfr9e8XrkzZ2bOmXPmzPdO5p4RRERgjDHGGGOM1TpSTWeAMcYYY4wxdmk4mGeMMcYYY6yW4mCeMcYYY4yxWoqDecYYY4wxxmopDuYZY4wxxhirpTiYZ4wxxhhjrJbiYJ4xxhhjjLFaioN5xhhjjDHGaikO5hljjDHGGKulOJhn7Ap21113wePxICcnp9w0999/P1RVxcmTJ7Fo0SIIIXD48OHfLI9lOXz4MIQQWLRokTOvuvO2Zs0aPP3002Uua9asGUaOHFkt+71cdu7ciZSUFHi9XgghMGfOnArT5+XlYdasWejWrRvq1asHl8uFZs2a4eGHH8aOHTucdCXHfdu2bRVuLz09HY888ghatWoFj8eD6OhotG/fHmPGjEF6enqF627cuBFCiHKn89vBrzVy5EhERERctu1V5NZbb8Wtt9560XQl5d+4caMzb+TIkWjWrFm15Y0xxkooNZ0Bxlj5Ro0ahRUrVuDDDz/EI488Ump5bm4uli9fjkGDBiE+Ph4DBw5EamoqEhMTayC3FavuvK1ZswZz584tM6Bfvnw56tWrVy37vVwefvhhFBYW4uOPP0ZUVFSFgeB//vMf9O3bF6dOncLYsWPxzDPPICIiAocPH8ann36Krl27IicnB16vt1L7PnbsGLp06YL69evjz3/+M6699lrk5uZi3759+PTTT3Ho0CE0adLkott5/vnn0atXr1Lzr7nmmkrloy7561//iokTJ9Z0NhhjVwEO5hm7gvXv3x8NGzbEggULygzmP/roIxQXF2PUqFEAgAYNGqBBgwa/dTYrpSbz1rlz5xrZb1X89NNPGDNmDPr3719hOtM0cdddd+HMmTNITU1Fu3btnGUpKSkYMWIEvvjiC6iqWul9v/POOzhz5gy2bt2K5s2bO/PvvPNOPPHEE7Asq1LbadmyJW688cZK77cuuxq/wDDGagY/ZsPYFUyWZYwYMQLbt2/Hnj17Si1fuHAhEhMTnQCwrEdZdu7ciUGDBiEuLg4ulwsNGzbEwIEDcezYMQBlPxJTQggRcqf73//+Nx566CG0bNkSYWFhaNSoEQYPHlxm3i50Yd4qejTj/LvSn3zyCfr27YvExER4PB4kJydj+vTpKCwsdNKMHDkSc+fOdfJcMpXsq6zHbI4ePYoHHnjAOS7Jycl49dVXQwLXkmPzyiuv4LXXXkPz5s0RERGB7t27Y8uWLRctM2AH6XfccQeioqLgdrvRqVMn/O///m+p42IYBt58800n7+VZsWIF9uzZg8cffzwkkD9f//79ERYWVqn8AUBWVhYkSUJcXFyZyyXp8l0qmjVrhkGDBmHVqlXo3LmzU6erVq0CYB+P5ORkhIeH4/rrry/38aC9e/eid+/eCA8PR4MGDTBhwgQUFRWFpCEizJs3D506dYLH40FUVBSGDh2KQ4cOlUr30ksvISkpCW63G126dMEXX3xR5n7379+Pfv36ISwsDLGxsRg7dizy8/NLpSvrMRshBCZMmIAPPvgAycnJCAsLQ8eOHZ2yn++zzz5Dhw4d4HK50KJFC7z++ut4+umnK2wbjLGrEwfzjF3hHn74YQghsGDBgpD5+/btw9atWzFixAjIslzmuoWFhejTpw9OnjyJuXPnYt26dZgzZw6aNm1aZgByMcePH0dMTAxeeOEFrF27FnPnzoWiKLjhhhtw4MCBKm2rS5cuSE1NDZnef/99qKqKtm3bOukOHjyIAQMG4L333sPatWsxadIkfPrppxg8eLCT5q9//SuGDh0KACHbK++RntOnT6NHjx746quv8Oyzz2LlypW47bbbMHXqVEyYMKFU+vOP3ZIlS1BYWIgBAwYgNze3wjIeOHAAPXr0wN69e/HGG29g2bJlaNOmDUaOHImXXnoJwLnHjwBg6NChTt7L89VXXwGw75pfLt27d4dlWRgyZAi+/PJL5OXlXdJ2LMuCYRilpgv9+OOPePzxxzFt2jQsW7YMXq8XQ4YMwYwZM/Duu+/i+eefx5IlS5Cbm4tBgwahuLg4ZH1d1zFgwAD07t0bK1aswIQJE/DWW29h2LBhIen++Mc/YtKkSbjtttuwYsUKzJs3D3v37kWPHj1w8uRJJ90zzzyDadOmoU+fPlixYgXGjRuHMWPGlGrTJ0+eREpKCn766SfMmzcPH3zwAQoKCspsM+VZvXo1/v73v2PmzJlYunQpoqOjcdddd4V8wVi7di2GDBmCmJgYfPLJJ3jppZfw0UcfhXwJZIwxBzHGrngpKSkUGxtLgUDAmffnP/+ZANAvv/zizFu4cCEBoLS0NCIi2rZtGwGgFStWlLvttLQ0AkALFy4stQwAzZgxo9x1DcOgQCBALVu2pMmTJ1e4zQvzdqGTJ09SixYtqG3btpSdnV1mGsuySNd12rRpEwGgH3/80Vk2fvx4Kq9LS0pKohEjRjifp0+fTgDo+++/D0k3btw4EkLQgQMHQsrRvn17MgzDSbd161YCQB999FGZ+ytxzz33kMvloqNHj4bM79+/P4WFhVFOTo4zDwCNHz++wu0REfXr148AkM/nu2haonPH/Ycffig3jWVZ9Mc//pEkSSIAJISg5ORkmjx5crn1db4NGzYQgHKn9PR0J21SUhJ5PB46duyYM2/Xrl0EgBITE6mwsNCZv2LFCgJAK1eudOaNGDGCANDrr78ekodZs2YRAPr222+JiCg1NZUA0KuvvhqSLj09nTweDz322GNERJSdnU1ut5vuuuuukHTfffcdAaCUlBRn3rRp00gIQbt27QpJ26dPHwJAGzZsCMlnUlJSSDoAFB8fT3l5ec68zMxMkiSJZs+e7cy77rrrqEmTJuT3+515+fn5FBMTU24bZ4xdvfjOPGO1wKhRo3DmzBmsXLkSAGAYBhYvXoybb74ZLVu2LHe9//qv/0JUVBSmTZuG+fPnY9++fb8qH4Zh4Pnnn0ebNm2gaRoURYGmaTh48CB+/vnnS95uYWEhBg4cCJ/Phy+++AL169d3lh06dAj33XcfEhISIMsyVFVFSkoKAFzyPtevX482bdrg+uuvD5k/cuRIEBHWr18fMn/gwIEh//3o0KEDAODIkSMX3U/v3r1L/Xh05MiRKCoqqvAO/G9JCIH58+fj0KFDmDdvHh566CHouo6//e1vaNu2LTZt2lSp7bz44ov44YcfSk3x8fEh6Tp16oRGjRo5n5OTkwHYo8ec/3hQyfyyjvP9998f8vm+++4DAGzYsAEAsGrVKggh8MADD4T8lyAhIQEdO3Z0Rp5JTU2Fz+crtb0ePXogKSkpZN6GDRvQtm1bdOzYscx9V0avXr0QGRnpfI6Pj0dcXJxTxsLCQmzbtg133nknNE1z0kVERIT8N4oxxkrwD2AZqwWGDh2KRx99FAsXLsTvf/97rFmzBidPnsSLL75Y4XperxebNm3CrFmz8MQTTyA7OxuJiYkYM2YMnnrqqSr9SBIApkyZgrlz52LatGlISUlBVFQUJEnC6NGjSz0KUVmGYWDo0KH45Zdf8PXXX4cEvgUFBbj55pvhdrvx3HPPoVWrVggLC0N6ejqGDBlyyfvMysoqc7SYhg0bOsvPFxMTE/LZ5XIBwEX3n5WVVeajPuXtpzKaNm0KAEhLS0Pr1q2rvH5FkpKSMG7cOOfzp59+invvvRd/+ctfsHXr1ouu36JFC3Tr1u2i6aKjo0M+lwSt5c33+Xwh8xVFKVUnCQkJAM4d05MnT4KISn2ROD+v56cvWb+sbZbIysoK+YFweekqcmG+Abs9lbSl7OzscvNdXlkYY1c3DuY
"text/plain": [
"<Figure size 1000x50 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import torch\n",
"\n",
"# Example tensor, replace with cls_embedding\n",
"# cls_embedding = torch.randn(1, 768) # Simulating an embedding tensor\n",
"\n",
"# Plotting the tensor values\n",
"plt.figure(figsize=(10, 0.5))\n",
"plt.imshow(cls_embedding, aspect='auto', cmap='viridis')\n",
"plt.colorbar()\n",
"plt.title(\"Visualization of CLS Embedding\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e21e7fc-aaa3-4a34-9720-69ba897e06f5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}