diff --git a/Local_CPU_LLM_Bling_Non_Interactive.ipynb b/Local_CPU_LLM_Bling_Non_Interactive.ipynb
new file mode 100644
index 0000000..6943032
--- /dev/null
+++ b/Local_CPU_LLM_Bling_Non_Interactive.ipynb
@@ -0,0 +1,255 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ea47b0b7196331ed",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Use a local CPU Large Language Model (LLM) to generate text\n",
+ "\n",
+ "This is a basic LLM, which\n",
+ "\n",
+ "* does not require a GPU\n",
+ "* is not fine-tuned for a specific task\n",
+ "* is not optimized for speed\n",
+ "* is not optimized for memory usage\n",
+ "* has a smaller model size\n",
+ "* ...\n",
+ "* is not as good as a GPU LLM\n",
+ "* is not as good as a fine-tuned LLM\n",
+ "* is not as good as a larger LLM\n",
+ "* ...\n",
+ "\n",
+ "Its purpose is to allow on-premises and self-hosted use of LLMs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "initial_id",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-03-17T11:26:30.714741Z",
+ "start_time": "2024-03-17T11:26:30.711615Z"
+ },
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Install the pinned dependencies of LangChain from the requirements.txt file:\n",
+ "# %pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c96a287c1fc724d2",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Use the Hugging Face pipeline with LLMware Bling\n",
+ "\n",
+ "* The Hugging Face pipeline is a convenient way to use a pre-trained model.\n",
+ "* LLMware Bling is a small LLM designed to run on a CPU.\n",
+ "* The model is loaded with `trust_remote_code=True` because its configuration and modeling code are downloaded from the Hugging Face Hub."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "2108b1c9373e0ec8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-03-17T11:23:02.052134Z",
+ "start_time": "2024-03-17T11:22:45.974223Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "loading file vocab.json from cache at None\n",
+ "loading file merges.txt from cache at None\n",
+ "loading file tokenizer.json from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/tokenizer.json\n",
+ "loading file added_tokens.json from cache at None\n",
+ "loading file special_tokens_map.json from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/special_tokens_map.json\n",
+ "loading file tokenizer_config.json from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/tokenizer_config.json\n",
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+ "loading configuration file config.json from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/config.json\n",
+ "loading configuration file config.json from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/config.json\n",
+ "Model config StableLMEpochConfig {\n",
+ " \"_name_or_path\": \"llmware/bling-stable-lm-3b-4e1t-v0\",\n",
+ " \"architectures\": [\n",
+ " \"StableLMEpochForCausalLM\"\n",
+ " ],\n",
+ " \"auto_map\": {\n",
+ " \"AutoConfig\": \"llmware/bling-stable-lm-3b-4e1t-v0--configuration_stablelm_epoch.StableLMEpochConfig\",\n",
+ " \"AutoModelForCausalLM\": \"llmware/bling-stable-lm-3b-4e1t-v0--modeling_stablelm_epoch.StableLMEpochForCausalLM\"\n",
+ " },\n",
+ " \"bos_token_id\": 0,\n",
+ " \"eos_token_id\": 0,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 2560,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 6912,\n",
+ " \"max_position_embeddings\": 4096,\n",
+ " \"model_type\": \"stablelm_epoch\",\n",
+ " \"norm_eps\": 1e-05,\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_heads\": 32,\n",
+ " \"num_hidden_layers\": 32,\n",
+ " \"num_key_value_heads\": 32,\n",
+ " \"rope_pct\": 0.25,\n",
+ " \"rope_theta\": 10000,\n",
+ " \"rotary_scaling_factor\": 1.0,\n",
+ " \"tie_word_embeddings\": false,\n",
+ " \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.38.2\",\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50304\n",
+ "}\n",
+ "\n",
+ "loading weights file pytorch_model.bin from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/pytorch_model.bin\n",
+ "Generate config GenerationConfig {\n",
+ " \"bos_token_id\": 0,\n",
+ " \"eos_token_id\": 0\n",
+ "}\n",
+ "\n",
+ "All model checkpoint weights were used when initializing StableLMEpochForCausalLM.\n",
+ "\n",
+ "All the weights of StableLMEpochForCausalLM were initialized from the model checkpoint at llmware/bling-stable-lm-3b-4e1t-v0.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use StableLMEpochForCausalLM for predictions without further training.\n",
+ "loading configuration file generation_config.json from cache at /home/marius/.cache/huggingface/hub/models--llmware--bling-stable-lm-3b-4e1t-v0/snapshots/a9e4d8d478d76dd062d9acd01b6ce3417217a344/generation_config.json\n",
+ "Generate config GenerationConfig {\n",
+ " \"bos_token_id\": 0,\n",
+ " \"eos_token_id\": 0\n",
+ "}\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
+ "\n",
+ "model_id = \"llmware/bling-stable-lm-3b-4e1t-v0\"\n",
+ "\n",
+ "# The model is downloaded once and stored in the Hugging Face cache directory;\n",
+ "# set TRANSFORMERS_CACHE (see below) if you prefer a different location.\n",
+ "import os\n",
+ "from transformers import logging\n",
+ "\n",
+ "# Optionally increase the logging level to see more details about the download process\n",
+ "logging.set_verbosity_info()\n",
+ "\n",
+ "# Uncomment to store the downloaded model in a custom cache directory\n",
+ "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/path/to/your/preferred/cache/directory\"\n",
+ "\n",
+ "# trust_remote_code=True is required because the model's configuration and\n",
+ "# modeling code are downloaded from the Hugging Face Hub together with the weights.\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+ "model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)\n",
+ "\n",
+ "# Wrap the model in a text-generation pipeline and expose it to LangChain.\n",
+ "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, max_new_tokens=500)\n",
+ "hf = HuggingFacePipeline(pipeline=pipe)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "904e6bf72c2ecf27",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Use the Hugging Face pipeline with LLMware Bling via LangChain\n",
+ "\n",
+ "* This is a basic prompt template with LangChain.\n",
+ "* The question is passed to the model via a chain built from the prompt and the pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1827b8c3423066b0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-03-17T11:23:25.839334Z",
+ "start_time": "2024-03-17T11:23:02.070024Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Disabling tokenizer parallelism, we're using DataLoader multithreading already\n",
+ "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain.prompts import PromptTemplate\n",
+ "\n",
+ "template = \"\"\"Question: {question}\n",
+ "\n",
+ "Answer: Let's think step by step.\"\"\"\n",
+ "prompt = PromptTemplate.from_template(template)\n",
+ "\n",
+ "chain = prompt | hf\n",
+ "\n",
+ "question = \"What is electroencephalography?\"\n",
+ "\n",
+ "test = chain.invoke({\"question\": question})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "ac2a19b6fb9aa3e2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-03-17T11:23:25.847308Z",
+ "start_time": "2024-03-17T11:23:25.841002Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " First, electroencephalography (EEG) is a medical test that measures electrical activity in the brain. Second, EEG is a type of electrodiagnostic test. Third, electrodiagnostic tests are used to evaluate neurological conditions.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(test)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}