{ "input": { "query": "Temp", "stream":true } }
import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain


def load_llm():
    n_gpu_layers = 200  # number of model layers to offload to the GPU
    n_batch = 500       # tokens processed per batch

    # Make sure the model path is correct for your system!
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:", "Disclaimer:", "</s>", "Source:", "Legal Inquiry:", "\n\n ", "Summary:"],
        n_batch=n_batch,
        temperature=0.5,  # LangChain's LlamaCpp expects "temperature", not "temp"
        n_ctx=8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm


def process_input(input):
    """
    Execute the application code.
    """
    llm = load_llm()
    query = input['query']

    # Starling/OpenChat-style prompt; the user query is injected as {context}.
    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User: {context} <|end_of_turn|>GPT4 Legal Assistant:
        """,
    )

    llmchain = LLMChain(llm=llm, prompt=prompt)
    answer = llmchain.run(query)
    return {"answer": answer}


# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                 #
# ---------------------------------------------------------------------------- #
def handler(event):
    """
    This is the handler function that will be called by RunPod serverless.
    """
    return process_input(event['input'])


if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        'return_aggregate_stream': True,
    })
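Once the handler is deployed, a request with the JSON body shown above can be sent to the endpoint over HTTP. The snippet below is a minimal sketch, assuming the standard RunPod /runsync REST route; ENDPOINT_ID and the RUNPOD_API_KEY environment variable are placeholders you must replace with values from your own RunPod account.

import os
import requests

ENDPOINT_ID = "your-endpoint-id"               # placeholder: your serverless endpoint ID
RUNPOD_API_KEY = os.environ["RUNPOD_API_KEY"]  # placeholder: your RunPod API key

# Synchronous invocation: the call blocks until the handler returns its result.
url = f"https://api.runpod.ai/v2/{ENDPOINT_ID}/runsync"
payload = {"input": {"query": "Temp", "stream": True}}
headers = {"Authorization": f"Bearer {RUNPOD_API_KEY}"}

response = requests.post(url, json=payload, headers=headers, timeout=300)
print(response.json())  # e.g. {"status": "COMPLETED", "output": {"answer": "..."}, ...}

For quick iteration before deploying, the runpod SDK can also run the handler locally by passing a test payload on the command line (for example, python handler.py --test_input '{"input": {"query": "Temp"}}'), if your installed SDK version supports that flag.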