diff --git a/Prototype_Beta_GCN_BERT_Embedding_Sysmon.ipynb b/Prototype_Beta_GCN_BERT_Embedding_Sysmon.ipynb new file mode 100644 index 0000000..19673a1 --- /dev/null +++ b/Prototype_Beta_GCN_BERT_Embedding_Sysmon.ipynb @@ -0,0 +1,301 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "53fefa0e-f261-47a5-9481-9c00e0d68e25", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n", + "Requirement already satisfied: torch==2.2.1+cpu in /home/marius/anaconda3/lib/python3.11/site-packages (2.2.1+cpu)\n", + "Requirement already satisfied: filelock in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.9.0)\n", + "Requirement already satisfied: typing-extensions>=4.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (4.11.0)\n", + "Requirement already satisfied: sympy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (1.11.1)\n", + "Requirement already satisfied: networkx in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1)\n", + "Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1.2)\n", + "Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (2023.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch==2.2.1+cpu) (2.1.1)\n", + "Requirement already satisfied: mpmath>=0.19 in /home/marius/anaconda3/lib/python3.11/site-packages (from sympy->torch==2.2.1+cpu) (1.3.0)\n", + "Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "Requirement already satisfied: torch-geometric in /home/marius/anaconda3/lib/python3.11/site-packages (2.5.3)\n", + "Requirement already satisfied: tqdm 
in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (4.65.0)\n", + "Requirement already satisfied: numpy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.24.3)\n", + "Requirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.11.4)\n", + "Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (2023.4.0)\n", + "Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.1.2)\n", + "Requirement already satisfied: aiohttp in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.8.5)\n", + "Requirement already satisfied: requests in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (2.31.0)\n", + "Requirement already satisfied: pyparsing in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.0.9)\n", + "Requirement already satisfied: scikit-learn in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.3.0)\n", + "Requirement already satisfied: psutil>=5.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (5.9.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (22.1.0)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (2.0.4)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (6.0.2)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (4.0.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in 
/home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.8.1)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.3.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.2.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch-geometric) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (1.26.16)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (2023.11.17)\n", + "Requirement already satisfied: joblib>=1.1.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from scikit-learn->torch-geometric) (1.2.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from scikit-learn->torch-geometric) (2.2.0)\n", + "Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "Collecting torch-scatter\n", + " Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_scatter-2.1.2%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (511 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m511.8/511.8 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: torch-scatter\n", + "Successfully installed torch-scatter-2.1.2+pt22cpu\n", + "Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "Collecting 
torch-sparse\n", + " Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_sparse-0.6.18%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m0m:02\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-sparse) (1.11.4)\n", + "Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /home/marius/anaconda3/lib/python3.11/site-packages (from scipy->torch-sparse) (1.24.3)\n", + "Installing collected packages: torch-sparse\n", + "Successfully installed torch-sparse-0.6.18+pt22cpu\n", + "Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "Collecting torch-cluster\n", + " Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_cluster-1.6.3%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (776 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.6/776.6 kB\u001b[0m \u001b[31m982.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m954.7 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-cluster) (1.11.4)\n", + "Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /home/marius/anaconda3/lib/python3.11/site-packages (from scipy->torch-cluster) (1.24.3)\n", + "Installing collected packages: torch-cluster\n", + "Successfully installed torch-cluster-1.6.3+pt22cpu\n", + "Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "Collecting torch-spline-conv\n", + " Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_spline_conv-1.2.2%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (215 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m215.8/215.8 kB\u001b[0m \u001b[31m427.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: torch-spline-conv\n", + "Successfully installed torch-spline-conv-1.2.2+pt22cpu\n" + ] + } + ], + "source": [ + "!pip install torch==2.2.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n", + "!pip install torch-geometric -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "!pip install torch-cluster -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n", + "!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f6c8b735-a54b-4485-bd16-34fad500f60f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/marius/anaconda3/lib/python3.11/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. 
Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from transformers import BertTokenizer, BertModel\n", + "import networkx as nx\n", + "from torch_geometric.data import Data\n", + "from torch_geometric.nn import GCNConv\n", + "from torch_geometric.utils import from_networkx" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ee9877b3-ece3-4f37-bc1b-73e0994325c7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GCN embeddings shape: torch.Size([7, 768])\n", + "tensor([[-0.2238, -0.1572, 0.0377, ..., -0.1945, -0.2714, 0.7107],\n", + " [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n", + " [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n", + " ...,\n", + " [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n", + " [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n", + " [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644]])\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import torch\n", + "import torch.nn.functional as F\n", + "import networkx as nx\n", + "from transformers import BertTokenizer, BertModel\n", + "from torch_geometric.data import Data\n", + "from torch_geometric.nn import GCNConv\n", + "from torch_geometric.utils import from_networkx\n", + "\n", + "# Sample CSV data\n", + "data = {\n", + " \"Timestamp\": [\"2024-05-15T16:00:15.887Z\"],\n", + " \"OS\": [\"win10\"],\n", + " \"IP\": [\"fe80::24b4:3691:44a6:38a1\"],\n", + " \"LogLevel\": [\"information\"],\n", + " \"EventID\": [5379],\n", + " \"Category\": [\"User Account Management\"],\n", + " \"Description\": [\n", + " \"\"\"Credential Manager credentials were read.\n", + "\n", + " Subject:\n", + " Security ID: S-1-5-18\n", + " Account Name: WIN10$\n", + " 
# Function to parse hierarchical data from description
def parse_description(description):
    """Parse the ``Key: value`` lines of an event description into a dict.

    The text is processed line by line:

    * A line containing ``:`` starts a new field. Everything before the first
      colon is the key and everything after it is the value, both stripped.
      Later occurrences of the same key overwrite earlier ones.
    * A line without ``:`` is treated as a continuation of the current field
      and appended to its value, separated by a single space.
    * A blank line ends the current field, so a free-text paragraph that
      follows the key/value section is not glued onto the last value.
    * Lines without ``:`` that appear before any field are ignored.

    Args:
        description: The raw description text. Anything that is not a ``str``
            (for example a NaN or None cell when this function is applied over
            a DataFrame column) is treated as having no fields.

    Returns:
        dict: Mapping of field name to field value, in order of first
        appearance. A header line such as ``Subject:`` yields an empty-string
        value.
    """
    hierarchy = {}
    if not isinstance(description, str):
        return hierarchy

    key = None
    for raw_line in description.splitlines():
        line = raw_line.strip()
        if not line:
            # Paragraph break: whatever comes next is not part of this field.
            key = None
            continue

        head, sep, tail = line.partition(':')
        if sep:
            key = head.strip()
            hierarchy[key] = tail.strip()
        elif key:
            # strip() avoids a leading space when the field's value was empty.
            hierarchy[key] = (hierarchy[key] + ' ' + line).strip()
    return hierarchy
# Define a simple GCN model
class GCN(torch.nn.Module):
    """Three-layer graph convolutional network producing one vector per node.

    The layers are ``GCNConv(in -> h1) -> ReLU -> GCNConv(h1 -> h2) -> ReLU ->
    GCNConv(h2 -> out)``; no activation is applied after the last layer.

    Args:
        in_channels: Width of the input node features. When omitted, it is
            read from ``num_node_features`` of the module-level ``data``
            object, which is what the zero-argument constructor always did.
            Pass it explicitly to build the model without relying on that
            global.
        hidden_channels: Pair ``(h1, h2)`` giving the widths of the two hidden
            layers. Defaults to ``(256, 128)``.
        out_channels: Width of the output vectors. Defaults to 768 so the
            output size matches the input embedding size used in this
            notebook.
    """

    def __init__(self, in_channels=None, hidden_channels=(256, 128), out_channels=768):
        super().__init__()
        if in_channels is None:
            # Backward-compatible default: take the feature width from the
            # module-level `data` object (not the `data` argument of forward).
            in_channels = data.num_node_features
        first_hidden, second_hidden = hidden_channels
        self.conv1 = GCNConv(in_channels, first_hidden)
        self.conv2 = GCNConv(first_hidden, second_hidden)
        self.conv3 = GCNConv(second_hidden, out_channels)

    def forward(self, data):
        """Run the three convolutions over ``data.x`` using ``data.edge_index``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            The output of the final convolution.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        return self.conv3(x, edge_index)
+ }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}