Stream a chat response using Server-Sent Events. The response is streamed token-by-token, simulating LLM inference. Each token is sent as an SSE event with JSON data. Args: request: Chat request with prompt and optional thinking mode. Returns: StreamingResponse with text/event-stream content type.
(request: ChatRequest)
| 26 | |
@app.post("/chat")
async def chat(request: ChatRequest):
    """Handle ``POST /chat`` by streaming generated tokens as Server-Sent Events.

    Tokens are pulled one at a time from the client's async stream and
    forwarded as they arrive, each serialized as a JSON object under the
    ``token`` key. A literal ``[DONE]`` event is emitted after the last token.

    Args:
        request: Chat request; ``prompt`` is forwarded to the app and
            ``thinking`` selects which action is invoked.

    Returns:
        StreamingResponse with text/event-stream content type
    """
    dirty_client = await get_dirty_client_async()

    # The thinking flag picks which action is requested from the app.
    if request.thinking:
        method = "generate_with_thinking"
    else:
        method = "generate"

    async def sse_events():
        token_stream = dirty_client.stream_async(
            "streaming_chat.chat_app:ChatApp",
            method,
            request.prompt,
        )
        async for token in token_stream:
            payload = json.dumps({"token": token})
            yield f"data: {payload}\n\n"
        # Emitted once the token stream is exhausted.
        yield "data: [DONE]\n\n"

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",  # Disable nginx buffering
    }
    return StreamingResponse(
        sse_events(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
| 62 | |
| 63 | |
| 64 | @app.post("/chat/sync", response_model=ChatResponse) |
nothing calls this directly
no test coverage detected