ngagefreak05

Cannot stream OpenAI-compatible response out

I have the code below for streaming the response; the generator works, but the response is not streamed:

import runpod
from llama_cpp import Llama
from typing import Generator
# Note: ErrorResponse is defined elsewhere in my worker; its import is not shown here.

llm = Llama(
    model_path="Phi-3-mini-4k-instruct-q4.gguf",
    n_gpu_layers=-1,
    n_ctx=4096,
)


class JobInput:
    def __init__(self, job):
        self.openai_route = job.get("openai_route")
        self.openai_input = job.get("openai_input", {})
        self.is_completion = "v1completions" in self.openai_route
        self.is_embedding = "embeddings" in self.openai_route
        self.embedding_format = self.openai_input.get('encoding_format', 'unknown')
        self.is_chatcompletion = "chat" in self.openai_route


def infer(job_params):
    # The OpenAI "n" parameter is not supported here, so drop it if present.
    if 'n' in job_params.openai_input:
        del job_params.openai_input['n']
    if job_params.openai_route and job_params.is_embedding:
        # Embeddings are not served by this worker.
        yield [ErrorResponse(
            message="The embedding endpoint is not supported on this URL.",
            type="unsupported_endpoint",
            code=501,  # Not Implemented
        ).model_dump()]
    else:
        # Choose chat completion vs. plain completion based on the route.
        if job_params.openai_route and job_params.is_chatcompletion:
            llm_engine = llm.create_chat_completion
        else:
            llm_engine = llm.create_completion

        if not job_params.openai_input.get("stream", False):
            yield llm_engine(job_params.openai_input)
        elif job_params.openai_input.get("stream", False):
            llm_op = llm_engine(job_params.openai_input)
            yield llm_op


async def handler(event):
    inp = event["input"]
    job_input = JobInput(inp)
    for line in infer(job_input):
        if isinstance(line, Generator):
            # If infer yielded a generator (the streaming case), forward its items.
            for l in line:
                yield l
        else:
            yield line


if __name__ == "__main__":
    runpod.serverless.start({
        "handler": handler,
        "return_aggregate_stream": True,
    })


Need help to fix this!
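
For reference, the streaming flow I am trying to achieve looks roughly like this. This is only a sketch, assuming that llama-cpp-python's create_chat_completion / create_completion return an iterator of chunk dicts when stream=True and that the OpenAI fields can be passed as keyword arguments; it reuses llm and JobInput from the code above, and infer_stream / streaming_handler are just illustrative names:

def infer_stream(job_params):
    # Assumption: with stream=True, the llama.cpp call returns an iterator of
    # OpenAI-style chunk dicts that can be yielded one at a time.
    llm_engine = (llm.create_chat_completion
                  if job_params.is_chatcompletion
                  else llm.create_completion)
    for chunk in llm_engine(**job_params.openai_input):
        yield chunk


async def streaming_handler(event):
    # Yield each chunk to RunPod as it is produced instead of yielding the
    # generator object itself.
    job_input = JobInput(event["input"])
    for chunk in infer_stream(job_input):
        yield chunk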