Prototype Beta, Graph Convolutional Network Embeddings with BERT with Sysmon data

This commit is contained in:
Marius Ciepluch 2024-05-18 18:24:24 +02:00
parent ac1311744c
commit da68bc33ba

View File

@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "53fefa0e-f261-47a5-9481-9c00e0d68e25",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
"Requirement already satisfied: torch==2.2.1+cpu in /home/marius/anaconda3/lib/python3.11/site-packages (2.2.1+cpu)\n",
"Requirement already satisfied: filelock in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.9.0)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (4.11.0)\n",
"Requirement already satisfied: sympy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (1.11.1)\n",
"Requirement already satisfied: networkx in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1)\n",
"Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1.2)\n",
"Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (2023.4.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch==2.2.1+cpu) (2.1.1)\n",
"Requirement already satisfied: mpmath>=0.19 in /home/marius/anaconda3/lib/python3.11/site-packages (from sympy->torch==2.2.1+cpu) (1.3.0)\n",
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"Requirement already satisfied: torch-geometric in /home/marius/anaconda3/lib/python3.11/site-packages (2.5.3)\n",
"Requirement already satisfied: tqdm in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (4.65.0)\n",
"Requirement already satisfied: numpy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.24.3)\n",
"Requirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.11.4)\n",
"Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (2023.4.0)\n",
"Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.1.2)\n",
"Requirement already satisfied: aiohttp in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.8.5)\n",
"Requirement already satisfied: requests in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (2.31.0)\n",
"Requirement already satisfied: pyparsing in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.0.9)\n",
"Requirement already satisfied: scikit-learn in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.3.0)\n",
"Requirement already satisfied: psutil>=5.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (5.9.0)\n",
"Requirement already satisfied: attrs>=17.3.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (22.1.0)\n",
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (2.0.4)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (6.0.2)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (4.0.2)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.8.1)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.3.3)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.2.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch-geometric) (2.1.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (1.26.16)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (2023.11.17)\n",
"Requirement already satisfied: joblib>=1.1.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from scikit-learn->torch-geometric) (1.2.0)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from scikit-learn->torch-geometric) (2.2.0)\n",
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"Collecting torch-scatter\n",
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_scatter-2.1.2%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (511 kB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m511.8/511.8 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hInstalling collected packages: torch-scatter\n",
"Successfully installed torch-scatter-2.1.2+pt22cpu\n",
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"Collecting torch-sparse\n",
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_sparse-0.6.18%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m0m:02\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-sparse) (1.11.4)\n",
"Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /home/marius/anaconda3/lib/python3.11/site-packages (from scipy->torch-sparse) (1.24.3)\n",
"Installing collected packages: torch-sparse\n",
"Successfully installed torch-sparse-0.6.18+pt22cpu\n",
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"Collecting torch-cluster\n",
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_cluster-1.6.3%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (776 kB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.6/776.6 kB\u001b[0m \u001b[31m982.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m954.7 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-cluster) (1.11.4)\n",
"Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /home/marius/anaconda3/lib/python3.11/site-packages (from scipy->torch-cluster) (1.24.3)\n",
"Installing collected packages: torch-cluster\n",
"Successfully installed torch-cluster-1.6.3+pt22cpu\n",
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"Collecting torch-spline-conv\n",
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_spline_conv-1.2.2%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (215 kB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.8/215.8 kB\u001b[0m \u001b[31m427.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n",
"\u001b[?25hInstalling collected packages: torch-spline-conv\n",
"Successfully installed torch-spline-conv-1.2.2+pt22cpu\n"
]
}
],
"source": [
"# Use %pip (not !pip) so packages are installed into the environment of the running kernel.\n",
"# Versions are pinned to the builds this notebook was last executed with; -q keeps the log short.\n",
"%pip install -q torch==2.2.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
"%pip install -q torch-geometric==2.5.3 -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install -q torch-scatter==2.1.2+pt22cpu -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install -q torch-sparse==0.6.18+pt22cpu -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install -q torch-cluster==1.6.3+pt22cpu -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install -q torch-spline-conv==1.2.2+pt22cpu -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f6c8b735-a54b-4485-bd16-34fad500f60f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/marius/anaconda3/lib/python3.11/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" torch.utils._pytree._register_pytree_node(\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"import torch.nn.functional as F\n",
"from transformers import BertTokenizer, BertModel\n",
"import networkx as nx\n",
"from torch_geometric.data import Data\n",
"from torch_geometric.nn import GCNConv\n",
"from torch_geometric.utils import from_networkx"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ee9877b3-ece3-4f37-bc1b-73e0994325c7",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GCN embeddings shape: torch.Size([7, 768])\n",
"tensor([[-0.2238, -0.1572, 0.0377, ..., -0.1945, -0.2714, 0.7107],\n",
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
" ...,\n",
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644]])\n"
]
}
],
"source": [
"import pandas as pd\n",
"import torch\n",
"import torch.nn.functional as F\n",
"import networkx as nx\n",
"from transformers import BertTokenizer, BertModel\n",
"from torch_geometric.data import Data\n",
"from torch_geometric.nn import GCNConv\n",
"from torch_geometric.utils import from_networkx\n",
"\n",
"# Sample event in the column layout of the CSV export (one list entry per row).\n",
"# Bound to its own name: `data` is reserved for the PyTorch Geometric graph built later.\n",
"sample_events = {\n",
"    \"Timestamp\": [\"2024-05-15T16:00:15.887Z\"],\n",
"    \"OS\": [\"win10\"],\n",
"    \"IP\": [\"fe80::24b4:3691:44a6:38a1\"],\n",
"    \"LogLevel\": [\"information\"],\n",
"    \"EventID\": [5379],\n",
"    \"Category\": [\"User Account Management\"],\n",
"    \"Description\": [\n",
"        \"\"\"Credential Manager credentials were read.\n",
"\n",
" Subject:\n",
" Security ID: S-1-5-18\n",
" Account Name: WIN10$\n",
" Account Domain: sec699-20\n",
" Logon ID: 0x3E7\n",
" Read Operation: Enumerate Credentials\n",
"\n",
" This event occurs when a user performs a read operation on stored credentials in Credential Manager.\"\"\"\n",
"    ]\n",
"}\n",
"\n",
"# Convert to DataFrame (one row per event)\n",
"df = pd.DataFrame(sample_events)\n",
"\n",
"# Function to parse hierarchical data from description\n",
"def parse_description(description):\n",
"    \"\"\"Parse the 'Key: Value' lines of an event description into a dict.\n",
"\n",
"    Every line is stripped. A line containing ':' is split at the first colon\n",
"    into a key and a value (a section header such as 'Subject:' gets an empty\n",
"    value). A line without ':' continues the field directly above it. A blank\n",
"    line ends the current field, so free-text paragraphs that follow the field\n",
"    list are ignored instead of being appended to the last value.\n",
"\n",
"    Args:\n",
"        description: Raw description text. Anything that is not a string (for\n",
"            example NaN from a missing CSV cell) yields an empty dict.\n",
"\n",
"    Returns:\n",
"        dict mapping field name to value in order of appearance; a repeated\n",
"        key keeps the value of its last occurrence.\n",
"    \"\"\"\n",
"    if not isinstance(description, str):\n",
"        return {}\n",
"\n",
"    hierarchy = {}\n",
"    key = None\n",
"    for line in description.splitlines():\n",
"        line = line.strip()\n",
"        if not line:\n",
"            key = None  # a blank line closes the current field\n",
"            continue\n",
"        if ':' in line:\n",
"            key, value = (part.strip() for part in line.split(':', 1))\n",
"            hierarchy[key] = value\n",
"        elif key is not None:\n",
"            # Continuation line: append to the field directly above\n",
"            hierarchy[key] = (hierarchy[key] + ' ' + line).strip()\n",
"    return hierarchy\n",
"\n",
"# Apply parsing to the Description column\n",
"df['ParsedDescription'] = df['Description'].apply(parse_description)\n",
"\n",
"# Initialize graph\n",
"G = nx.Graph()\n",
"\n",
"# Load pre-trained BERT model and tokenizer. The encoder gets its own name so that\n",
"# the GCN created further down (bound to `model`) does not shadow it.\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
"bert_model = BertModel.from_pretrained('bert-base-uncased')\n",
"bert_model.eval()  # inference only: make sure dropout is disabled\n",
"embedding_dim = bert_model.config.hidden_size  # width of the BERT vectors, reused for placeholder nodes\n",
"\n",
"# Add nodes and edges to the graph\n",
"for index, row in df.iterrows():\n",
"    parsed = row['ParsedDescription']\n",
"    description = row['Description']\n",
"\n",
"    # Tokenize and encode the description; the [CLS] vector is the event embedding.\n",
"    # A single sequence needs no padding, and no_grad() skips building an autograd graph.\n",
"    inputs = tokenizer(description, return_tensors='pt', max_length=512, truncation=True)\n",
"    with torch.no_grad():\n",
"        outputs = bert_model(**inputs)\n",
"    description_embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()\n",
"\n",
"    # Add event node with BERT embedding\n",
"    event_node = 'Event_{}'.format(index)\n",
"    G.add_node(event_node, type='event', embedding=description_embedding)\n",
"\n",
"    # Add one node per parsed value and connect it to the event node\n",
"    for key, value in parsed.items():\n",
"        if not value:\n",
"            continue  # section headers such as 'Subject:' carry no value to link\n",
"        if not G.has_node(value):\n",
"            # Zero vector as default embedding for non-event nodes\n",
"            G.add_node(value, type=key, embedding=torch.zeros(embedding_dim).numpy())\n",
"        G.add_edge(event_node, value)\n",
"\n",
"# Convert NetworkX graph to PyTorch Geometric data\n",
"data = from_networkx(G)\n",
"\n",
"# Add node features to the data object, one row per node in G.nodes order.\n",
"# Stacking per-node tensors avoids the slow conversion of a Python list of NumPy arrays.\n",
"node_embeddings = [\n",
"    torch.as_tensor(G.nodes[node]['embedding'], dtype=torch.float) for node in G.nodes\n",
"]\n",
"data.x = torch.stack(node_embeddings)\n",
"\n",
"# Define a simple GCN model\n",
"class GCN(torch.nn.Module):\n",
"    \"\"\"Three-layer graph convolutional network that returns one vector per node.\n",
"\n",
"    Args:\n",
"        in_channels: Size of the input node features. When omitted it is read\n",
"            from ``data.num_node_features`` of the notebook-level graph ``data``,\n",
"            so a plain ``GCN()`` keeps working.\n",
"        hidden_channels: Output size of the first convolution.\n",
"        bottleneck_channels: Output size of the second convolution.\n",
"        out_channels: Size of the returned node embeddings. Defaults to\n",
"            ``in_channels`` so the output matches the input embedding size.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, in_channels=None, hidden_channels=256, bottleneck_channels=128, out_channels=None):\n",
"        super().__init__()\n",
"        if in_channels is None:\n",
"            # Backward-compatible default: size the model from the global graph object\n",
"            in_channels = data.num_node_features\n",
"        if out_channels is None:\n",
"            out_channels = in_channels\n",
"        self.conv1 = GCNConv(in_channels, hidden_channels)\n",
"        self.conv2 = GCNConv(hidden_channels, bottleneck_channels)\n",
"        self.conv3 = GCNConv(bottleneck_channels, out_channels)\n",
"\n",
"    def forward(self, data):\n",
"        \"\"\"Apply the three convolutions to ``data.x`` over ``data.edge_index``, with ReLU after the first two.\"\"\"\n",
"        x, edge_index = data.x, data.edge_index\n",
"        x = F.relu(self.conv1(x, edge_index))\n",
"        x = F.relu(self.conv2(x, edge_index))\n",
"        return self.conv3(x, edge_index)\n",
"\n",
"# Training configuration\n",
"SEED = 42\n",
"NUM_EPOCHS = 200\n",
"LEARNING_RATE = 0.01\n",
"WEIGHT_DECAY = 5e-4\n",
"\n",
"# Create and train the model. Seeding first fixes the random weight initialisation,\n",
"# so repeated runs start from the same parameters.\n",
"torch.manual_seed(SEED)\n",
"model = GCN()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
"\n",
"model.train()\n",
"for epoch in range(NUM_EPOCHS):\n",
"    optimizer.zero_grad()\n",
"    out = model(data)\n",
"    loss = F.mse_loss(out, data.x)  # Dummy loss for example purposes: reconstruct the input features\n",
"    loss.backward()\n",
"    optimizer.step()\n",
"\n",
"# Extract the embeddings from the trained model\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    embeddings = model(data)\n",
"\n",
"# Print the embeddings\n",
"print(\"GCN embeddings shape:\", embeddings.shape)\n",
"print(embeddings)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdb7c728-a8c2-4e89-bea8-d0f31585a7cc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}