Runpod•2y ago•

29 replies

Serverless Endpoint Streaming

I'm currently working with Llama.cpp for my inference and have setup my handler.py file to be similar to this guide.

https://docs.runpod.io/docs/handler-generator

My input and handler file looks like this:

{
    "input": {
      "query": "Temp",
      "stream":true
    }
}

{
    "input": {
      "query": "Temp",
      "stream":true
    }
}

{
    "input": {
      "query": "Temp",
      "stream":true
    }
}

{
    "input": {
      "query": "Temp",
      "stream":true
    }
}

import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain

def load_llm():

    n_gpu_layers = 200 
    n_batch = 500  

    # Make sure the model path is correct for your system!
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:","Disclaimer:","</s>","Source:","Legal Inquiry:","\n\n        ","Summary:"],
        n_batch=n_batch,
        temp= 0.5,
        n_ctx = 8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm


def process_input(input):
    """
    Execute the application code
    """
    llm = load_llm()
    query = input['query']

    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User:  {context}
        
        <|end_of_turn|>GPT4 Legal Assistant:
        """,
    )
    
    llmchain = LLMChain(llm = llm, prompt = prompt)
    answer = llmchain.run(query)

    return {
        "answer": answer
    }


# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                #
# ---------------------------------------------------------------------------- #
def handler(event):
    """
    This is the handler function that will be called by RunPod serverless.
    """
    return process_input(event['input'])


if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        "return_aggregate_stream": True
        })

import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain

def load_llm():

    n_gpu_layers = 200 
    n_batch = 500  

    # Make sure the model path is correct for your system!
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:","Disclaimer:","</s>","Source:","Legal Inquiry:","\n\n        ","Summary:"],
        n_batch=n_batch,
        temp= 0.5,
        n_ctx = 8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm


def process_input(input):
    """
    Execute the application code
    """
    llm = load_llm()
    query = input['query']

    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User:  {context}
        
        <|end_of_turn|>GPT4 Legal Assistant:
        """,
    )
    
    llmchain = LLMChain(llm = llm, prompt = prompt)
    answer = llmchain.run(query)

    return {
        "answer": answer
    }


# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                #
# ---------------------------------------------------------------------------- #
def handler(event):
    """
    This is the handler function that will be called by RunPod serverless.
    """
    return process_input(event['input'])


if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        "return_aggregate_stream": True
        })

import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain

def load_llm():

    n_gpu_layers = 200 
    n_batch = 500  

    # Make sure the model path is correct for your system!
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:","Disclaimer:","</s>","Source:","Legal Inquiry:","\n\n        ","Summary:"],
        n_batch=n_batch,
        temp= 0.5,
        n_ctx = 8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm


def process_input(input):
    """
    Execute the application code
    """
    llm = load_llm()
    query = input['query']

    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User:  {context}
        
        <|end_of_turn|>GPT4 Legal Assistant:
        """,
    )
    
    llmchain = LLMChain(llm = llm, prompt = prompt)
    answer = llmchain.run(query)

    return {
        "answer": answer
    }


# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                #
# ---------------------------------------------------------------------------- #
def handler(event):
    """
    This is the handler function that will be called by RunPod serverless.
    """
    return process_input(event['input'])


if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        "return_aggregate_stream": True
        })

import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain

def load_llm():

    n_gpu_layers = 200 
    n_batch = 500  

    # Make sure the model path is correct for your system!
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:","Disclaimer:","</s>","Source:","Legal Inquiry:","\n\n        ","Summary:"],
        n_batch=n_batch,
        temp= 0.5,
        n_ctx = 8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm


def process_input(input):
    """
    Execute the application code
    """
    llm = load_llm()
    query = input['query']

    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User:  {context}
        
        <|end_of_turn|>GPT4 Legal Assistant:
        """,
    )
    
    llmchain = LLMChain(llm = llm, prompt = prompt)
    answer = llmchain.run(query)

    return {
        "answer": answer
    }


# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                #
# ---------------------------------------------------------------------------- #
def handler(event):
    """
    This is the handler function that will be called by RunPod serverless.
    """
    return process_input(event['input'])


if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        "return_aggregate_stream": True
        })

My problem is that whenever I am testing this out in the requests tab on the dashboard, it keeps saying stream is empty.

https://github.com/runpod-workers/worker-vllm

RunPod

Generator Handler

A handler that can stream fractional results.

GitHub

GitHub - runpod-workers/worker-vllm: The RunPod worker template for...

The RunPod worker template for serving our large language model endpoints. Powered by VLLM. - GitHub - runpod-workers/worker-vllm: The RunPod worker template for serving our large language model en...

Justin Merrell•12/28/23, 6:28 PM

For streaming to work your handler needs to yield, take a look at https://github.com/runpod-workers/worker-vllm/blob/a247a3afe10a7d9002fb1f35971b7c5e29873950/src/handler.py#L13

JJustin Merrell For streaming to work your handler needs to yield, take a look at https://github...

ConceptOP•12/28/23, 6:28 PM

Thank you!

Justin Merrell•12/28/23, 6:29 PM

No problem, let me know if you run into any other issues

JJustin Merrell No problem, let me know if you run into any other issues 🙂

ConceptOP•12/28/23, 6:31 PM

I'm assuimg this wouldn't really work if I wanted to test locally because the output would just be async generator objects right?

CConcept I'm assuimg this wouldn't really work if I wanted to test locally because the ou...

Justin Merrell•12/28/23, 6:33 PM

How are you testing locally? If you are starting the API server with the

rp_serve_api

rp_serve_api

flag there is a stream test endpoint

ConceptOP•12/28/23, 6:33 PM

Just python rp_handler.py

Justin Merrell•12/28/23, 6:34 PM

Could you show your local test run to see what you are doing specifically?

ConceptOP•12/28/23, 6:36 PM

import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain

def load_llm():
    n_gpu_layers = 200  # Adjust based on your model and GPU VRAM pool.
    n_batch = 500  # Should be between 1 and n_ctx, considering your GPU VRAM.
    # Ensure the model path is correct for your system.
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:","Disclaimer:","</s>","Source:","Legal Inquiry:","\n\n        ","Summary:"],
        n_batch=n_batch,
        temp=0.5,
        n_ctx=8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm

async def process_input(input):
    """
    Execute the application code with streaming support.
    """
    llm = load_llm()
    query = input['query']

    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User:  {context}
        
        GPT4 Legal Assistant:
        """,
    )
    
    llmchain = LLMChain(llm=llm, prompt=prompt)

    # Assuming llmchain.run supports asynchronous iteration for streaming.
    async for part in llmchain.run(query, stream=True):
        yield {"part": part}

# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                #
# ---------------------------------------------------------------------------- #
async def handler(event):
    """
    Asynchronous handler function for RunPod serverless with streaming support.
    """
    async for output in process_input(event['input']):
        yield output

if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        "return_aggregate_stream": True
    })

import runpod
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain

def load_llm():
    n_gpu_layers = 200  # Adjust based on your model and GPU VRAM pool.
    n_batch = 500  # Should be between 1 and n_ctx, considering your GPU VRAM.
    # Ensure the model path is correct for your system.
    llm = LlamaCpp(
        model_path="starling-lm-7b-alpha.Q4_K_M.gguf",
        n_gpu_layers=n_gpu_layers,
        use_mlock=True,
        use_mmap=True,
        max_tokens=1024,
        stop=["Q:","Disclaimer:","</s>","Source:","Legal Inquiry:","\n\n        ","Summary:"],
        n_batch=n_batch,
        temp=0.5,
        n_ctx=8192,
        repeat_penalty=1.18,
    )
    print("LLM Loaded! ;)")
    return llm

async def process_input(input):
    """
    Execute the application code with streaming support.
    """
    llm = load_llm()
    query = input['query']

    prompt = PromptTemplate(
        input_variables=["context"],
        template="""
        GPT4 User:  {context}
        
        GPT4 Legal Assistant:
        """,
    )
    
    llmchain = LLMChain(llm=llm, prompt=prompt)

    # Assuming llmchain.run supports asynchronous iteration for streaming.
    async for part in llmchain.run(query, stream=True):
        yield {"part": part}

# ---------------------------------------------------------------------------- #
#                                RunPod Handler                                #
# ---------------------------------------------------------------------------- #
async def handler(event):
    """
    Asynchronous handler function for RunPod serverless with streaming support.
    """
    async for output in process_input(event['input']):
        yield output

if __name__ == '__main__':
    runpod.serverless.start({
        'handler': handler,
        "return_aggregate_stream": True
    })

Not sure if I added yield correctly but below is my output:

--- Starting Serverless Worker | Version 1.4.0 ---
INFO | Using test_input.json as job input.
DEBUG | Retrieved local job: {'input': {'query': 'Temp', 'stream': True}, 'id': 'local_test'}
INFO | local_test | Started.
DEBUG | local_test | Handler output: <async_generator object handler at 0x7f83774a6440>
DEBUG | local_test | run_job return: {'output': <async_generator object handler at 0x7f83774a6440>}
INFO | Job local_test completed successfully.
INFO | Job result: {'output': <async_generator object handler at 0x7f83774a6440>}
INFO | Local testing complete, exiting.

Justin Merrell•12/28/23, 9:02 PM

I see, I will need to re-visit the testing for streaming. Calling your program with the

rp_serve_api

rp_serve_api

is going to be the current option for testing

JJustin Merrell I see, I will need to re-visit the testing for streaming. Calling your program w...

ConceptOP•12/28/23, 9:15 PM

Tried using it. /stream only returns 'detail not found'.

I should be expecting my result like this?:

{
  "delayTime": 1234,
  "executionTime": 1234,
  "id": "...",
  "output": [
    [
      {
        "text": " Run",
        "usage": {
          "input": 27,
          "output": 1
        }
      },
      {
        "text": "Pod",
        "usage": {
          "input": 27,
          "output": 2
        }
      },
      {
        "text": " is",
        "usage": {
          "input": 27,
          "output": 3
        }
      },
      {
        "text": " considered",
        "usage": {
          "input": 27,
          "output": 4
        }
      },
      {
        "text": " the",
        "usage": {
          "input": 27,
          "output": 5
        }
      },
      {
        "text": " best",
        "usage": {
          "input": 27,
          "output": 6
        }
      },
      {
        "text": " GPU",
        "usage": {
          "input": 27,
          "output": 7
        }
      },
      {
        "text": " provider",
        "usage": {
          "input": 27,
          "output": 8
        }
      },
      {
        "text": " for",
        "usage": {
          "input": 27,
          "output": 9
        }
      },
      {
        "text": " several",
        "usage": {
          "input": 27,
          "output": 10
        }
      }
    ]
  ],
  "status": "COMPLETED"
}

{
  "delayTime": 1234,
  "executionTime": 1234,
  "id": "...",
  "output": [
    [
      {
        "text": " Run",
        "usage": {
          "input": 27,
          "output": 1
        }
      },
      {
        "text": "Pod",
        "usage": {
          "input": 27,
          "output": 2
        }
      },
      {
        "text": " is",
        "usage": {
          "input": 27,
          "output": 3
        }
      },
      {
        "text": " considered",
        "usage": {
          "input": 27,
          "output": 4
        }
      },
      {
        "text": " the",
        "usage": {
          "input": 27,
          "output": 5
        }
      },
      {
        "text": " best",
        "usage": {
          "input": 27,
          "output": 6
        }
      },
      {
        "text": " GPU",
        "usage": {
          "input": 27,
          "output": 7
        }
      },
      {
        "text": " provider",
        "usage": {
          "input": 27,
          "output": 8
        }
      },
      {
        "text": " for",
        "usage": {
          "input": 27,
          "output": 9
        }
      },
      {
        "text": " several",
        "usage": {
          "input": 27,
          "output": 10
        }
      }
    ]
  ],
  "status": "COMPLETED"
}

CConcept Tried using it. /stream only returns 'detail not found'. I should be expecting...

Justin Merrell•12/28/23, 9:19 PM

Correct, could you provide a screenshot of your test? I'll double check that it is working as expected.

JJustin Merrell Correct, could you provide a screenshot of your test? I'll double check that it ...

ConceptOP•12/29/23, 2:30 PM

Hey Justin! I followed the readme above with the vllm runpod worker and got the streaming to work. One of my questions is that my delay time on this new endpoint is significantly longer than my original endpoint.

ConceptOP•12/29/23, 2:30 PM

Seems like adding more workers speeds it up?

Justin Merrell•12/29/23, 2:58 PM

When you sent the first request in were there any workers ready?

ConceptOP•12/29/23, 3:01 PM

I think it was a cold start. I'm struggling for the workers now to turn off after running.

ConceptOP•12/29/23, 3:01 PM

ConceptOP•12/29/23, 3:04 PM

Ah active workers

ConceptOP•12/29/23, 3:04 PM

I'm dumb.

ConceptOP•12/29/23, 3:05 PM

I turned on the active worker since the cold boot was 50 seconds haha

CConcept Hey Justin! I followed the readme above with the vllm runpod worker and got the ...

azeem5782•1/19/24, 6:05 AM

Could u please tell, how were u able to stream the output? What changes did you make in your previous code which already had "yield" for streaming and still was not working?

Aazeem5782 Could u please tell, how were u able to stream the output? What changes did you ...

ConceptOP•1/19/24, 6:06 AM

https://github.com/runpod-workers/worker-vllm

Check this out :)

GitHub

GitHub - runpod-workers/worker-vllm: The RunPod worker template for...

The RunPod worker template for serving our large language model endpoints. Powered by vLLM. - GitHub - runpod-workers/worker-vllm: The RunPod worker template for serving our large language model en...

CConcept https://github.com/runpod-workers/worker-vllm Check this out :)

azeem5782•1/19/24, 6:09 AM

I checked this repo, but wasn't able to figure out what was needed. Could u please be a little specific like what exactly helped in streaming apart from yield?

ConceptOP•1/19/24, 6:09 AM

I would fork this repo and adapt it to what you need.

ConceptOP•1/19/24, 6:09 AM

Use this worker to build/init your llm

ngagefreak05•6/24/24, 6:04 AM

Hi, could you please provide the url of your fork, need this too

CConcept I would fork this repo and adapt it to what you need.

Jason•6/24/24, 6:38 AM

sure thats why the build instructions are there in the readme.md file

ngagefreak05•6/24/24, 7:09 AM

sorry, may be i am reading the wrong readme.md at https://github.com/runpod-workers/worker-vllm/blob/main/README.md, can you please guide me with section or link (if the link is wrong)

digigoblin•6/24/24, 7:14 AM

This is the correct documentation

Serverless Endpoint Streaming

Similar Threads

Similar Threads

Similar Threads