{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": "# Attention-based log-similarity clustering with FAISS and Linformer", "id": "a4a84ab109c95c68" }, { "metadata": {}, "cell_type": "markdown", "source": [ "## File Placement\n", "\n", "Place the following file in the same directory as this notebook:\n", "\n", "`lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered_with_vectors_clean.parquet`\n", "\n", "**SHA1 Hash:** `8349713e82c50b0c747b05e085e533d4b01e833a`\n", "\n", "You can download the file from [Kaggle](https://www.kaggle.com/datasets/mariusciepluch/log2ml-blindtest-maldoc-activity-capture).\n", "\n", "## Package Installation\n", "\n", "Install the required packages by running the following command:\n", "\n", "```bash\n", "pip install -r requirements.cpu.txt\n", "```\n", "\n", "Make sure to refer to the `/dependencies` folder for the `requirements.cpu.txt` file.\n" ], "id": "ba1d2ff7e2c690b6" }, { "metadata": {}, "cell_type": "markdown", "source": "# Preparation: read the parquet file with the trace and vector data (Linformer)", "id": "9c3e728d3766b157" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T07:35:03.602352Z", "start_time": "2024-08-11T07:32:30.187631Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "import numpy as np\n", "import json\n", "\n", "# Read from Parquet using pandas\n", "pdf_read = pd.read_parquet(\"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024_filtered_with_vectors_clean.parquet\")\n", "\n", "# Function to convert JSON-encoded strings back to numpy arrays\n", "def string_to_vector(s):\n", " return np.array(json.loads(s))\n", "\n", "# Convert JSON strings back to numpy arrays\n", "pdf_read['message_vector'] = pdf_read['message_vector_str'].apply(string_to_vector)\n", "\n", "# Verify the shape of the vector\n", "print(\"Vector lengths:\")\n", "print(pdf_read['message_vector'].apply(len).head())\n", "\n", "# Check a sample vector to ensure dimensionality is preserved\n", "sample_vector = pdf_read['message_vector'].iloc[0]\n", "print(f\"\\nSample vector shape: {sample_vector.shape}\")\n", "\n", "# Drop the string column as it's no longer needed\n", "pdf_read = pdf_read.drop(columns='message_vector_str')\n", "\n", "# Verify vector lengths (equivalent to the Polars operation)\n", "print(\"\\nVector lengths (pandas equivalent of Polars operation):\")\n", "print(pdf_read['message_vector'].apply(len).head())\n", "\n", "# If you need to see the full DataFrame structure\n", "print(\"\\nDataFrame info:\")\n", "pdf_read.info()\n", "\n", "# If you want to see the first few rows of the DataFrame\n", "print(\"\\nFirst few rows of the DataFrame:\")\n", "print(pdf_read.head())" ], "id": "ed66a2e3503a0e58", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vector lengths:\n", "0 30000\n", "1 30000\n", "2 30000\n", "3 30000\n", "4 30000\n", "Name: message_vector, dtype: int64\n", "\n", "Sample vector shape: (30000,)\n", "\n", "Vector lengths (pandas equivalent of Polars operation):\n", "0 30000\n", "1 30000\n", "2 30000\n", "3 30000\n", "4 30000\n", "Name: message_vector, dtype: int64\n", "\n", "DataFrame info:\n", "\n", "RangeIndex: 13455 entries, 0 to 13454\n", "Data columns (total 16 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 @timestamp 13455 non-null datetime64[us, UTC]\n", " 1 host.hostname 13455 non-null object \n", " 2 host.ip 13455 non-null object \n", " 3 log.level 13455 non-null object \n", " 4 winlog.event_id 13455 non-null int64 
\n", " 5 winlog.task 13455 non-null object \n", " 6 message 13455 non-null object \n", " 7 filtered_message 13455 non-null object \n", " 8 image 13455 non-null object \n", " 9 target_filename 13455 non-null object \n", " 10 parent_image 13455 non-null object \n", " 11 text 13455 non-null object \n", " 12 temp_folder 13455 non-null object \n", " 13 filename 13455 non-null object \n", " 14 label 13455 non-null object \n", " 15 message_vector 13455 non-null object \n", "dtypes: datetime64[us, UTC](1), int64(1), object(14)\n", "memory usage: 1.6+ MB\n", "\n", "First few rows of the DataFrame:\n", " @timestamp host.hostname host.ip \\\n", "0 2024-07-28 15:08:24.277000+00:00 win10 fe80::c1af:35de:6006:d4cf \n", "1 2024-07-28 15:08:24.488000+00:00 win10 fe80::c1af:35de:6006:d4cf \n", "2 2024-07-28 15:08:25.005000+00:00 win10 fe80::c1af:35de:6006:d4cf \n", "3 2024-07-28 15:08:25.005000+00:00 win10 fe80::c1af:35de:6006:d4cf \n", "4 2024-07-28 15:08:25.030000+00:00 win10 fe80::c1af:35de:6006:d4cf \n", "\n", " log.level winlog.event_id \\\n", "0 information 3 \n", "1 information 3 \n", "2 information 10 \n", "3 information 10 \n", "4 information 10 \n", "\n", " winlog.task \\\n", "0 Network connection detected (rule: NetworkConn... \n", "1 Network connection detected (rule: NetworkConn... \n", "2 Process accessed (rule: ProcessAccess) \n", "3 Process accessed (rule: ProcessAccess) \n", "4 Process accessed (rule: ProcessAccess) \n", "\n", " message \\\n", "0 Network connection detected:\\nRuleName: -\\nUtc... \n", "1 Network connection detected:\\nRuleName: -\\nUtc... \n", "2 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "3 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "4 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "\n", " filtered_message \\\n", "0 Network connection detected: \\nRuleName: -\\nPr... \n", "1 Network connection detected: \\nRuleName: -\\nPr... \n", "2 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "3 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "4 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "\n", " image target_filename parent_image \\\n", "0 C:\\Windows\\System32\\svchost.exe \n", "1 C:\\Windows\\System32\\svchost.exe \n", "2 C:\\Windows\\system32\\svchost.exe \n", "3 C:\\Windows\\system32\\svchost.exe \n", "4 C:\\Windows\\system32\\svchost.exe \n", "\n", " text temp_folder filename \\\n", "0 Network connection detected: \\nRuleName: -\\nPr... No \n", "1 Network connection detected: \\nRuleName: -\\nPr... No \n", "2 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "3 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "4 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "\n", " label message_vector \n", "0 good [0.24616119265556335, -0.2502608895301819, 0.1... \n", "1 good [0.23642203211784363, -0.24263174831867218, 0.... \n", "2 good [0.27883192896842957, -0.11810377985239029, 0.... \n", "3 good [0.2748359739780426, -0.10700400173664093, 0.0... \n", "4 good [0.26899218559265137, -0.1174423024058342, 0.0... \n" ] } ], "execution_count": 5 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Loading the dataset\n", "\n", "`X` is the typical variable for a NumPy array." 
], "id": "28854db2c4cd55d3" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T07:37:43.771465Z", "start_time": "2024-08-11T07:37:42.081740Z" } }, "cell_type": "code", "source": [ "import numpy as np\n", "\n", "df_f = pdf_read\n", "\n", "print(df_f)\n", "\n", "print()\n", "# Convert the 'message_vector' column to a NumPy array\n", "X = np.array(df_f['message_vector'].to_list())\n", "print(\"Original data shape:\", X.shape)\n", "del(pdf_read) # Free up memory" ], "id": "def1f0da1ca9fb85", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " @timestamp host.hostname \\\n", "0 2024-07-28 15:08:24.277000+00:00 win10 \n", "1 2024-07-28 15:08:24.488000+00:00 win10 \n", "2 2024-07-28 15:08:25.005000+00:00 win10 \n", "3 2024-07-28 15:08:25.005000+00:00 win10 \n", "4 2024-07-28 15:08:25.030000+00:00 win10 \n", "... ... ... \n", "13450 2024-07-28 23:35:53.054000+00:00 win10 \n", "13451 2024-07-28 23:35:53.054000+00:00 win10 \n", "13452 2024-07-28 23:35:54.133000+00:00 win10 \n", "13453 2024-07-28 23:35:54.133000+00:00 win10 \n", "13454 2024-07-28 23:41:55.301000+00:00 win10 \n", "\n", " host.ip log.level winlog.event_id \\\n", "0 fe80::c1af:35de:6006:d4cf information 3 \n", "1 fe80::c1af:35de:6006:d4cf information 3 \n", "2 fe80::c1af:35de:6006:d4cf information 10 \n", "3 fe80::c1af:35de:6006:d4cf information 10 \n", "4 fe80::c1af:35de:6006:d4cf information 10 \n", "... ... ... ... \n", "13450 fe80::c1af:35de:6006:d4cf information 10 \n", "13451 fe80::c1af:35de:6006:d4cf information 10 \n", "13452 fe80::c1af:35de:6006:d4cf information 10 \n", "13453 fe80::c1af:35de:6006:d4cf information 10 \n", "13454 fe80::c1af:35de:6006:d4cf information 1 \n", "\n", " winlog.task \\\n", "0 Network connection detected (rule: NetworkConn... \n", "1 Network connection detected (rule: NetworkConn... \n", "2 Process accessed (rule: ProcessAccess) \n", "3 Process accessed (rule: ProcessAccess) \n", "4 Process accessed (rule: ProcessAccess) \n", "... ... \n", "13450 Process accessed (rule: ProcessAccess) \n", "13451 Process accessed (rule: ProcessAccess) \n", "13452 Process accessed (rule: ProcessAccess) \n", "13453 Process accessed (rule: ProcessAccess) \n", "13454 Process Create (rule: ProcessCreate) \n", "\n", " message \\\n", "0 Network connection detected:\\nRuleName: -\\nUtc... \n", "1 Network connection detected:\\nRuleName: -\\nUtc... \n", "2 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "3 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "4 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "... ... \n", "13450 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "13451 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "13452 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "13453 Process accessed:\\nRuleName: -\\nUtcTime: 2024-... \n", "13454 Process Create:\\nRuleName: -\\nUtcTime: 2024-07... \n", "\n", " filtered_message \\\n", "0 Network connection detected: \\nRuleName: -\\nPr... \n", "1 Network connection detected: \\nRuleName: -\\nPr... \n", "2 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "3 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "4 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "... ... \n", "13450 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "13451 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "13452 Process accessed: \\nRuleName: -\\nSourceProcess... \n", "13453 Process accessed: \\nRuleName: -\\nSourceProcess... 
\n", "13454 Process Create: \\nRuleName: -\\nProcessId: 1074... \n", "\n", " image target_filename parent_image \\\n", "0 C:\\Windows\\System32\\svchost.exe \n", "1 C:\\Windows\\System32\\svchost.exe \n", "2 C:\\Windows\\system32\\svchost.exe \n", "3 C:\\Windows\\system32\\svchost.exe \n", "4 C:\\Windows\\system32\\svchost.exe \n", "... ... ... ... \n", "13450 C:\\Windows\\system32\\svchost.exe \n", "13451 C:\\Windows\\system32\\svchost.exe \n", "13452 C:\\Windows\\system32\\svchost.exe \n", "13453 C:\\Windows\\system32\\svchost.exe \n", "13454 C:\\Windows\\System32\\svchost.exe services.exe \n", "\n", " text temp_folder filename \\\n", "0 Network connection detected: \\nRuleName: -\\nPr... No \n", "1 Network connection detected: \\nRuleName: -\\nPr... No \n", "2 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "3 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "4 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "... ... ... ... \n", "13450 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "13451 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "13452 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "13453 Process accessed: \\nRuleName: -\\nSourceProcess... No \n", "13454 Process Create: \\nRuleName: -\\nProcessId: 1074... No \n", "\n", " label message_vector \n", "0 good [0.24616119265556335, -0.2502608895301819, 0.1... \n", "1 good [0.23642203211784363, -0.24263174831867218, 0.... \n", "2 good [0.27883192896842957, -0.11810377985239029, 0.... \n", "3 good [0.2748359739780426, -0.10700400173664093, 0.0... \n", "4 good [0.26899218559265137, -0.1174423024058342, 0.0... \n", "... ... ... \n", "13450 good [0.2619136571884155, -0.13029363751411438, 0.0... \n", "13451 good [0.26092982292175293, -0.12313028424978256, 0.... \n", "13452 good [0.26543211936950684, -0.12485812604427338, 0.... \n", "13453 good [0.28400424122810364, -0.11322563141584396, 0.... \n", "13454 good [0.24720659852027893, -0.2456829845905304, 0.1... \n", "\n", "[13455 rows x 16 columns]\n", "\n", "Original data shape: (13455, 30000)\n" ] } ], "execution_count": 6 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Direct insertion into FAISS\n", "\n", "No pre-processing." 
], "id": "ac0ca03b76a3a17" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T09:13:52.171351Z", "start_time": "2024-08-11T09:13:18.527761Z" } }, "cell_type": "code", "source": [ "import numpy as np\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.schema import Document\n", "from langchain.embeddings.base import Embeddings\n", "\n", "# Assuming df_f is your DataFrame and X is your numpy array of vectors\n", "X = np.array(df_f['message_vector'].to_list())\n", "\n", "# Create Document objects\n", "documents = [Document(page_content=text, metadata={'index': i}) for i, text in enumerate(df_f['message'])]\n", "\n", "# Create a custom Embeddings class for pre-computed vectors\n", "class PrecomputedEmbeddings(Embeddings):\n", " def __init__(self, vectors):\n", " self.vectors = vectors\n", "\n", " def embed_documents(self, texts):\n", " # Return all vectors, assuming order matches\n", " return self.vectors.tolist()\n", "\n", " def embed_query(self, text):\n", " # This method is required but won't be used for indexing\n", " # Return a zero vector of the same dimension as your embeddings\n", " return np.zeros(self.vectors.shape[1]).tolist()\n", "\n", " # Adding this method to conform to the Embeddings interface\n", " def embed_text(self, text):\n", " return self.embed_query(text)\n", "\n", "# Create embeddings object\n", "embeddings = PrecomputedEmbeddings(X)\n", "\n", "# Create FAISS index\n", "db = FAISS.from_documents(documents, embeddings, distance_strategy=\"COSINE\")\n", "\n", "# Save the index locally\n", "db.save_local(\"faiss_index_sysmon_cosine\")\n", "\n", "print(\"FAISS index created and saved successfully.\")" ], "id": "fc17dfa046912457", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FAISS index created and saved successfully.\n" ] } ], "execution_count": 78 }, { "metadata": {}, "cell_type": "markdown", "source": [ "# Preparation: init the vector function (Linformer) - CPU\n", "\n", "This is for the search queries." 
], "id": "b26253dcf651f61a" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T08:04:08.242781Z", "start_time": "2024-08-11T08:04:07.275145Z" } }, "cell_type": "code", "source": [ "from linformer_pytorch import LinformerLM\n", "import torch\n", "from tokenizers import Tokenizer\n", "\n", "# Define the device\n", "device = torch.device(\"cpu\")\n", "\n", "print(\"This uses a \" + str(device) + \" device\")\n", "\n", "# Load the custom tokenizer\n", "tokenizer = Tokenizer.from_file(\"log_tokenizer.json\")\n", "\n", "# Initialize the Linformer model\n", "linformer_model = LinformerLM(\n", " num_tokens=30000,\n", " input_size=700,\n", " channels=64,\n", " dim_k=128,\n", " dim_ff=128,\n", " dropout_ff=0.15,\n", " nhead=4,\n", " depth=2,\n", " dropout=0.1,\n", " activation=\"gelu\",\n", " checkpoint_level=\"C0\",\n", " parameter_sharing=\"layerwise\",\n", " emb_dim=128,\n", ").to(device)\n", "\n", "def vectorize_text(text):\n", " MAX_LENGTH = 700\n", "\n", " # Tokenize using the custom tokenizer\n", " encoded = tokenizer.encode(text)\n", "\n", " # Get token IDs\n", " input_ids = encoded.ids\n", "\n", " # Ensure the input_ids length is exactly MAX_LENGTH\n", " input_ids = input_ids[:MAX_LENGTH] if len(input_ids) > MAX_LENGTH else input_ids + [0] * (MAX_LENGTH - len(input_ids))\n", "\n", " # Convert to PyTorch tensor and move to CPU\n", " input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)\n", "\n", " # Get the model outputs\n", " with torch.no_grad():\n", " outputs = linformer_model(input_ids)\n", "\n", " # Assuming outputs is the tensor of interest\n", " vector = outputs.mean(dim=1)\n", " return vector.numpy()" ], "id": "872e525dac4192", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This uses a cpu device\n", "Vector shape: (1, 30000)\n", "Vector: [[0.3499783 0.20774072 0.2604245 ... 
0.21256167 0.11159717 0.01982626]]\n" ] } ], "execution_count": 14 }, { "metadata": {}, "cell_type": "markdown", "source": "# Test 1: using FAISS for string matches", "id": "c597512bf39274e6" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T09:04:06.021271Z", "start_time": "2024-08-11T09:04:05.979544Z" } }, "cell_type": "code", "source": [ "# Vectorize a message of interest\n", "\n", "interesting_log_line = r\"\"\"\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "\"\"\"\n", "\n", "# Vectorize the message\n", "vectorized_log = vectorize_text(interesting_log_line)\n", "print(\"Vectorized log shape:\", vectorized_log.shape)" ], "id": "9c62d4b06bfbd468", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vectorized log shape: (1, 30000)\n" ] } ], "execution_count": 70 }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T08:44:25.242299Z", "start_time": "2024-08-11T08:44:25.221213Z" } }, "cell_type": "code", "source": [ "import numpy as np\n", "import faiss\n", "from sklearn.preprocessing import normalize\n", "\n", "# Function to find partial matches\n", "def find_partial_matches(db, interesting_log_line, excluded_strings):\n", " matches = []\n", " for i in range(db.index.ntotal):\n", " doc = db.docstore.search(db.index_to_docstore_id[i])\n", " if interesting_log_line.strip() in doc.page_content and not any(excluded in doc.page_content for excluded in excluded_strings):\n", " matches.append((doc, i))\n", " return matches\n", "\n", "# Ensure the query vector is a 2D numpy array of float32\n", "query_vector = vectorized_log.astype(np.float32)\n", "\n", "# Normalize the query vector\n", "query_vector = normalize(query_vector)\n", "\n", "# Debugging: Print the shape and type of query_vector\n", "print(\"Processed query_vector shape:\", query_vector.shape)\n", "print(\"Processed query_vector type:\", type(query_vector))\n", "\n", "# Parameters\n", "excluded_strings = [\n", " \"Image: C:\\\\Users\\\\student\\\\AppData\\\\Local\\\\miniconda3\\\\python.exe\",\n", " \"Image: C:\\\\Program Files (x86)\\\\Microsoft\\\\EdgeUpdate\\\\\",\n", " \"Image: C:\\\\Program Files\\\\Avast Software\\\\Avast\",\n", " \"SourceImage: C:\\\\ProgramData\\\\Microsoft\\\\Windows Defender\\\\platform\\\\4.18.2011.6-0\\\\MsMpEng.exe\",\n", " \"Image: C:\\\\Users\\\\student\\\\AppData\\\\Local\\\\Microsoft\\\\Teams\\\\current\\\\Teams.exe\",\n", " \"Image: C:\\\\Program Files (x86)\\\\Microsoft\\\\Edge\\\\Application\",\n", " \"SourceImage: C:\\\\Windows\"\n", "]\n", "max_matches_to_print = 3 # Number of partial matches to process and print\n", "\n", "try:\n", " # Get the raw FAISS index\n", " raw_index = db.index\n", "\n", " # Print index type for debugging\n", " print(f\"Index type: {type(raw_index)}\")\n", "\n", " # Find partial matches\n", " partial_matches = find_partial_matches(db, interesting_log_line, excluded_strings)\n", "\n", " if not partial_matches:\n", " print(\"No partial matches found for the interesting_log_line (excluding specified strings).\")\n", " else:\n", " print(f\"Found {len(partial_matches)} partial matches. 
Printing details for the first {max_matches_to_print}:\")\n", " \n", " for match_num, (match, match_index) in enumerate(partial_matches[:max_matches_to_print], 1):\n", " print(f\"\\nPartial match {match_num}:\")\n", " print(f\"Message: {match.page_content[:300]}...\") # Print first 300 characters\n", " print(f\"Metadata: {match.metadata}\")\n", "\n", "except Exception as e:\n", " print(f\"Error occurred: {e}\")\n", " print(\"FAISS index info:\")\n", " print(f\"Index size: {db.index.ntotal}\")\n", " print(f\"Index dimension: {db.index.d}\")" ], "id": "68578d23610b7f43", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processed query_vector shape: (1, 30000)\n", "Processed query_vector type: \n", "Index type: \n", "Found 114 partial matches. Printing details for the first 3:\n", "\n", "Partial match 1:\n", "Message: File created:\n", "RuleName: EXE\n", "UtcTime: 2024-07-28 15:12:53.459\n", "ProcessGuid: {18e8265a-5fef-66a6-f701-000000004400}\n", "ProcessId: 10072\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "CreationUtcTime: 2024-07-23 14:24:50.520...\n", "Metadata: {'index': 620}\n", "\n", "Partial match 2:\n", "Message: File created:\n", "RuleName: EXE\n", "UtcTime: 2024-07-28 15:44:34.527\n", "ProcessGuid: {18e8265a-675e-66a6-1905-000000004400}\n", "ProcessId: 8708\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "CreationUtcTime: 2024-07-23 14:24:50.520...\n", "Metadata: {'index': 4908}\n", "\n", "Partial match 3:\n", "Message: File created:\n", "RuleName: EXE\n", "UtcTime: 2024-07-28 15:52:39.361\n", "ProcessGuid: {18e8265a-6942-66a6-6a05-000000004400}\n", "ProcessId: 8648\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "CreationUtcTime: 2024-07-23 14:24:50.520...\n", "Metadata: {'index': 5063}\n" ] } ], "execution_count": 54 }, { "metadata": {}, "cell_type": "markdown", "source": "# Test 2: using FAISS for similarity search (Cosine distance)", "id": "15f768a0f95cae1b" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# Vectorize a message of interest\n", "\n", "interesting_log_line = r\"\"\"\n", "File created:\n", "RuleName: EXE\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "\"\"\"\n", "\n", "# Vectorize the message\n", "vectorized_log = vectorize_text(interesting_log_line)\n", "print(\"Vectorized log shape:\", vectorized_log.shape)" ], "id": "922dedb1a2898b4e" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T09:07:48.072941Z", "start_time": "2024-08-11T09:07:07.337422Z" } }, "cell_type": "code", "source": [ "import numpy as np\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.schema import Document\n", "from langchain.embeddings.base import Embeddings\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.decomposition import TruncatedSVD\n", "\n", "# Assuming df_f is your DataFrame and it contains the 'message_vector' and 'message' columns\n", "X = np.array(df_f['message_vector'].to_list())\n", "\n", "# Apply StandardScaler\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)\n", "\n", "# Apply TruncatedSVD (PCA)\n", "pca = 
TruncatedSVD(n_components=500, random_state=42)\n", "X_pca = pca.fit_transform(X_scaled)\n", "\n", "# Create Document objects\n", "documents = [Document(page_content=text, metadata={'index': i}) for i, text in enumerate(df_f['message'])]\n", "\n", "# Create a custom Embeddings class for pre-computed vectors\n", "class PrecomputedEmbeddings(Embeddings):\n", " def __init__(self, vectors):\n", " self.vectors = vectors\n", "\n", " def embed_documents(self, texts):\n", " # Return all vectors, assuming order matches\n", " return self.vectors.tolist()\n", "\n", " def embed_query(self, text):\n", " # This method is required but won't be used for indexing\n", " # Return a zero vector of the same dimension as your embeddings\n", " return np.zeros(self.vectors.shape[1]).tolist()\n", "\n", " def embed_text(self, text):\n", " return self.embed_query(text)\n", "\n", "# Create embeddings object with PCA-transformed vectors\n", "embeddings = PrecomputedEmbeddings(X_pca)\n", "\n", "# Create FAISS index\n", "db = FAISS.from_documents(documents, embeddings, distance_strategy=\"COSINE\")\n", "\n", "# Save the index locally\n", "db.save_local(\"faiss_index_sysmon_cosine_pca\")\n", "\n", "print(\"FAISS index created and saved successfully.\")\n", "\n", "# Save the scaler and PCA objects for later use\n", "import joblib\n", "joblib.dump(scaler, 'scaler.joblib')\n", "joblib.dump(pca, 'pca.joblib')\n", "\n", "print(\"Scaler and PCA objects saved for future use.\")\n", "\n", "# Print some information about the transformed data\n", "print(f\"Original vector shape: {X.shape}\")\n", "print(f\"PCA-transformed vector shape: {X_pca.shape}\")\n", "print(f\"Explained variance ratio sum: {pca.explained_variance_ratio_.sum():.4f}\")" ], "id": "6c21b4db811e6fa0", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FAISS index created and saved successfully.\n", "Scaler and PCA objects saved for future use.\n", "Original vector shape: (13455, 30000)\n", "PCA-transformed vector shape: (13455, 500)\n", "Explained variance ratio sum: 1.0000\n" ] } ], "execution_count": 75 }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T09:07:49.468033Z", "start_time": "2024-08-11T09:07:49.361450Z" } }, "cell_type": "code", "source": [ "import joblib\n", "import numpy as np\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.decomposition import TruncatedSVD\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.embeddings.base import Embeddings\n", "\n", "# Load the saved scaler and PCA objects\n", "scaler = joblib.load('scaler.joblib')\n", "pca = joblib.load('pca.joblib')\n", "\n", "# Recreate the PrecomputedEmbeddings class (it needs to be defined before loading)\n", "class PrecomputedEmbeddings(Embeddings):\n", " def __init__(self, vectors):\n", " self.vectors = vectors\n", "\n", " def embed_documents(self, texts):\n", " return self.vectors.tolist()\n", "\n", " def embed_query(self, text):\n", " return np.zeros(self.vectors.shape[1]).tolist()\n", "\n", " def embed_text(self, text):\n", " return self.embed_query(text)\n", "\n", "# Create a dummy embeddings object (we'll replace its vectors later)\n", "embeddings = PrecomputedEmbeddings(np.zeros((1, 100)))\n", "\n", "# Load your FAISS index\n", "db = FAISS.load_local(\"faiss_index_sysmon_cosine_pca\", embeddings, allow_dangerous_deserialization=True)\n", "\n", "def preprocess_query(query_vector):\n", " # Ensure query_vector is 2D\n", " if query_vector.ndim == 1:\n", " query_vector = query_vector.reshape(1, -1)\n", " \n", " # 
Apply the same preprocessing as during index creation\n", " query_scaled = scaler.transform(query_vector)\n", " query_pca = pca.transform(query_scaled)\n", " \n", " return query_pca\n", "\n", "# Your query vector\n", "query_vector = vectorized_log.astype(np.float32)\n", "\n", "# Preprocess the query vector\n", "processed_query = preprocess_query(query_vector)\n", "\n", "# Now you can use this processed_query with your FAISS index\n", "# For example:\n", "results = db.similarity_search_by_vector(processed_query[0], k=5)\n", "\n", "# Print results\n", "for doc in results:\n", " print(f\"Score: {doc.metadata.get('score', 'N/A')}\")\n", " print(f\"Content: {doc.page_content[:300]}...\")\n", " print(\"---\")" ], "id": "a2070b029db84d7e", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Score: N/A\n", "Content: Process Create:\n", "RuleName: -\n", "UtcTime: 2024-07-28 15:38:31.608\n", "ProcessGuid: {18e8265a-65f7-66a6-f904-000000004400}\n", "ProcessId: 8612\n", "Image: C:\\Users\\student\\AppData\\Local\\miniconda3\\python.exe\n", "FileVersion: 3.12.4\n", "Description: Python\n", "Product: Python\n", "Company: Python Software Foundation\n", "OriginalFileName: p...\n", "---\n", "Score: N/A\n", "Content: Process Create:\n", "RuleName: -\n", "UtcTime: 2024-07-28 15:38:35.802\n", "ProcessGuid: {18e8265a-65fb-66a6-fe04-000000004400}\n", "ProcessId: 2224\n", "Image: C:\\Users\\student\\AppData\\Local\\miniconda3\\python.exe\n", "FileVersion: 3.12.4\n", "Description: Python\n", "Product: Python\n", "Company: Python Software Foundation\n", "OriginalFileName: p...\n", "---\n", "Score: N/A\n", "Content: Process Create:\n", "RuleName: -\n", "UtcTime: 2024-07-28 15:38:34.158\n", "ProcessGuid: {18e8265a-65fa-66a6-fc04-000000004400}\n", "ProcessId: 8200\n", "Image: C:\\Users\\student\\AppData\\Local\\miniconda3\\python.exe\n", "FileVersion: 3.12.4\n", "Description: Python\n", "Product: Python\n", "Company: Python Software Foundation\n", "OriginalFileName: p...\n", "---\n", "Score: N/A\n", "Content: Process Create:\n", "RuleName: -\n", "UtcTime: 2024-07-28 15:38:36.641\n", "ProcessGuid: {18e8265a-65fc-66a6-ff04-000000004400}\n", "ProcessId: 9208\n", "Image: C:\\Users\\student\\AppData\\Local\\miniconda3\\python.exe\n", "FileVersion: 3.12.4\n", "Description: Python\n", "Product: Python\n", "Company: Python Software Foundation\n", "OriginalFileName: p...\n", "---\n", "Score: N/A\n", "Content: Process Create:\n", "RuleName: -\n", "UtcTime: 2024-07-28 15:38:33.313\n", "ProcessGuid: {18e8265a-65f9-66a6-fb04-000000004400}\n", "ProcessId: 8448\n", "Image: C:\\Users\\student\\AppData\\Local\\miniconda3\\python.exe\n", "FileVersion: 3.12.4\n", "Description: Python\n", "Product: Python\n", "Company: Python Software Foundation\n", "OriginalFileName: p...\n", "---\n" ] } ], "execution_count": 76 }, { "metadata": {}, "cell_type": "markdown", "source": "# Test 3: using FAISS for similarity search (Cosine distance) with normalized vectors", "id": "216ee6b650e81d71" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# Vectorize a message of interest\n", "\n", "interesting_log_line = r\"\"\"\n", "File created:\n", "RuleName: EXE\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "\"\"\"\n", "\n", "# Vectorize the message\n", "vectorized_log = vectorize_text(interesting_log_line)\n", "print(\"Vectorized log shape:\", vectorized_log.shape)" ], "id": "c3cef1ba97cdb226" 
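}, { "metadata": {}, "cell_type": "markdown", "source": [ "A note on the `Similarity Score: N/A` lines in the search results: `similarity_search_by_vector` returns bare `Document` objects, and the stored metadata never contains a `score` key. To get actual distances, the scored variant of the call can be used; a minimal sketch, assuming `db` is the index and `q` is an already-normalized query vector:\n", "\n", "```python\n", "# Returns (Document, score) pairs; with the COSINE strategy the score\n", "# is a distance, so smaller means more similar.\n", "for doc, score in db.similarity_search_with_score_by_vector(q, k=5):\n", "    print(f\"{score:.4f}  {doc.page_content[:80]}\")\n", "```\n" ], "id": "3f1c9a7d2b5e4c88"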
}, { "metadata": { "ExecuteTime": { "end_time": "2024-08-11T09:10:28.089283Z", "start_time": "2024-08-11T09:09:50.983922Z" } }, "cell_type": "code", "source": [ "import numpy as np\n", "import faiss\n", "from sklearn.preprocessing import normalize\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.embeddings.base import Embeddings\n", "\n", "# Load your original vectors (assuming you still have access to them)\n", "X = np.array(df_f['message_vector'].to_list())\n", "\n", "# Normalize the vectors (this makes L2 distance equivalent to cosine similarity)\n", "X_normalized = normalize(X, axis=1)\n", "\n", "class PrecomputedEmbeddings(Embeddings):\n", " def __init__(self, vectors):\n", " self.vectors = vectors\n", "\n", " def embed_documents(self, texts):\n", " return self.vectors.tolist()\n", "\n", " def embed_query(self, text):\n", " # This should not be called, but we'll keep it for compatibility\n", " return np.zeros(self.vectors.shape[1]).tolist()\n", "\n", "# Create embeddings object with normalized vectors\n", "embeddings = PrecomputedEmbeddings(X_normalized)\n", "\n", "# Create Document objects\n", "documents = [Document(page_content=text, metadata={'index': i}) for i, text in enumerate(df_f['message'])]\n", "\n", "# Create FAISS index\n", "db = FAISS.from_documents(documents, embeddings, distance_strategy=\"COSINE\")\n", "\n", "# Save the index\n", "db.save_local(\"faiss_index_sysmon_cosine_normalized\")\n", "\n", "print(\"FAISS index created and saved successfully.\")\n", "\n", "# Now, let's search:\n", "def search_similar(query_vector, k=5):\n", " # Normalize the query vector\n", " query_vector_normalized = normalize(query_vector.reshape(1, -1))[0]\n", " \n", " results = db.similarity_search_by_vector(query_vector_normalized, k=k)\n", " \n", " print(f\"Top {k} similar messages:\")\n", " for i, doc in enumerate(results, 1):\n", " print(f\"\\n{i}. Similarity Score: {doc.metadata.get('score', 'N/A')}\")\n", " print(f\"Message: {doc.page_content[:300]}...\") # Print first 300 characters\n", " print(f\"Metadata: {doc.metadata}\")\n", "\n", "# Your query vector\n", "query_vector = vectorized_log.astype(np.float32)\n", "\n", "# Search for similar messages\n", "search_similar(query_vector, k=5)" ], "id": "f9cd363d55965b81", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FAISS index created and saved successfully.\n", "Top 5 similar messages:\n", "\n", "1. Similarity Score: N/A\n", "Message: Dns query:\n", "RuleName: -\n", "UtcTime: 2024-07-28 18:27:54.471\n", "ProcessGuid: {00000000-0000-0000-0000-000000000000}\n", "ProcessId: 5940\n", "QueryName: dc.sec699-20.lab\n", "QueryStatus: 0\n", "QueryResults: ::ffff:192.168.20.101;\n", "Image: ...\n", "Metadata: {'index': 8550}\n", "\n", "2. Similarity Score: N/A\n", "Message: Dns query:\n", "RuleName: -\n", "UtcTime: 2024-07-28 17:24:01.108\n", "ProcessGuid: {18e8265a-7eb1-66a6-6a08-000000004400}\n", "ProcessId: 10084\n", "QueryName: dc.sec699-20.lab\n", "QueryStatus: 0\n", "QueryResults: ::ffff:192.168.20.101;\n", "Image: ...\n", "Metadata: {'index': 7393}\n", "\n", "3. Similarity Score: N/A\n", "Message: Registry value set:\n", "RuleName: Context,ProtectedModeExitOrMacrosUsed\n", "EventType: SetValue\n", "UtcTime: 2024-07-28 19:31:25.327\n", "ProcessGuid: {18e8265a-9c88-66a6-170c-000000004400}\n", "ProcessId: 10624\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetObject: HKU\\S-1-5-21-3148146594-1027658...\n", "Metadata: {'index': 9736}\n", "\n", "4. 
Similarity Score: N/A\n", "Message: Process terminated:\n", "RuleName: -\n", "UtcTime: 2024-07-28 15:39:52.088\n", "ProcessGuid: {18e8265a-663c-66a6-0905-000000004400}\n", "ProcessId: 9528\n", "Image: C:\\Users\\student\\AppData\\Local\\miniconda3\\python.exe...\n", "Metadata: {'index': 4880}\n", "\n", "5. Similarity Score: N/A\n", "Message: Process terminated:\n", "RuleName: -\n", "UtcTime: 2024-07-28 19:16:57.077\n", "ProcessGuid: {18e8265a-9926-66a6-a50b-000000004400}\n", "ProcessId: 6200\n", "Image: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe...\n", "Metadata: {'index': 9468}\n" ] } ], "execution_count": 77 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3" } }, "nbformat": 4, "nbformat_minor": 5 }