import os
import sys
import datetime

from openai import OpenAI
import gradio as gr
from gradio.components.chatbot import ChatMessage, Message
from typing import Any, Literal

# Verbose logging is enabled by setting the DEBUG_LOG environment variable to "True".
DEBUG_LOG = os.environ.get("DEBUG_LOG") == "True"

print(f"Gradio version: {gr.__version__}")

title = None
description = None

# Incremented on every chat request; logged alongside the turn count.
chat_start_count = 0

# Model and endpoint settings are supplied via environment variables.
model_config = {
    "MODEL_NAME": os.environ.get("MODEL_NAME"),
    "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
    "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
    "VLLM_API_URL": os.environ.get("VLLM_API_URL"),
    "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
}
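
# A minimal sketch of the environment this app expects; every value below is an
# illustrative placeholder, not a real model name, endpoint, or credential:
#
#   export MODEL_NAME="my-org/my-model"
#   export MODE_DISPLAY_NAME="My Model"
#   export MODEL_HF_URL="https://huggingface.co/my-org/my-model"
#   export VLLM_API_URL="http://localhost:8000/v1"
#   export AUTH_TOKEN="changeme"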

# OpenAI-compatible client pointed at the vLLM endpoint.
client = OpenAI(
    api_key=model_config.get("AUTH_TOKEN"),
    base_url=model_config.get("VLLM_API_URL"),
)


def log_message(message):
    """Print a debug message when DEBUG_LOG is enabled."""
    if DEBUG_LOG:
        print(message)


def _check_format(messages: Any, type: Literal["messages", "tuples"] = "messages") -> None:
    """Validate that `messages` matches the given Gradio chat history format."""
    if type == "messages":
        all_valid = all(
            (
                isinstance(message, dict)
                and "role" in message
                and "content" in message
            )
            or isinstance(message, ChatMessage | Message)
            for message in messages
        )
        if not all_valid:
            # Report the first offending message before raising.
            for i, message in enumerate(messages):
                if not (
                    isinstance(message, dict)
                    and "role" in message
                    and "content" in message
                ) and not isinstance(message, ChatMessage | Message):
                    print(f"_check_format() --> Invalid message at index {i}: {message}\n", file=sys.stderr)
                    break

            raise Exception(
                "Data incompatible with messages format. Each message should be a dictionary with 'role' and 'content' keys or a ChatMessage object."
            )
    elif not all(
        isinstance(message, (tuple, list)) and len(message) == 2
        for message in messages
    ):
        raise Exception(
            "Data incompatible with tuples format. Each message should be a list or tuple of length 2."
        )
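
# Illustrative calls (hypothetical data) showing what _check_format() accepts:
#
#   _check_format([{"role": "user", "content": "hi"}], "messages")  # OK
#   _check_format([["hi", "hello"]], "tuples")                      # OK
#   _check_format([{"role": "user"}], "messages")                   # raises Exception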


def chat_fn(message, history):
    """Stream the assistant reply, separating the model's reasoning from its
    final answer via [BEGIN FINAL RESPONSE] / [END FINAL RESPONSE] markers."""
    log_message(f"{'-' * 80}\nchat_fn() --> Message: {message}")

    global chat_start_count
    chat_start_count += 1
    print(
        f"{datetime.datetime.now()}: chat_start_count: {chat_start_count}, "
        f"turns: {len(history or []) // 3}"
    )

    log_message(f"Original History: {history}")
    _check_format(history, "messages")

    # Drop earlier "thought" messages (assistant messages carrying a metadata
    # title) so they are not sent back to the model.
    history = [
        item for item in history
        if not (
            isinstance(item, dict)
            and item.get("role") == "assistant"
            and isinstance(item.get("metadata"), dict)
            and item.get("metadata", {}).get("title") is not None
        )
    ]
    log_message(f"Updated History: {history}")
    _check_format(history, "messages")

    history.append({"role": "user", "content": message})
    log_message(f"History with user message: {history}")
    _check_format(history, "messages")

    stream = client.chat.completions.create(
        model=model_config.get("MODEL_NAME"),
        messages=history,
        temperature=0.8,
        stream=True,
    )

    # Placeholder thought bubble, updated in place as tokens arrive.
    history.append(gr.ChatMessage(
        role="assistant",
        content="Thinking...",
        metadata={"title": "🧠 Thought"},
    ))
    log_message(f"History added thinking: {history}")
    _check_format(history, "messages")

    output = ""
    completion_started = False
    for chunk in stream:
        # delta.content can be None (e.g. on the final chunk), so fall back to "".
        content = getattr(chunk.choices[0].delta, "content", "") or ""
        output += content

        parts = output.split("[BEGIN FINAL RESPONSE]")
        if len(parts) > 1:
            # Strip the end marker, with or without a trailing <|end|> token.
            if parts[1].endswith("[END FINAL RESPONSE]"):
                parts[1] = parts[1].replace("[END FINAL RESPONSE]", "")
            if parts[1].endswith("[END FINAL RESPONSE]\n<|end|>"):
                parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<|end|>", "")

        # Text before the marker updates the thought bubble; once the marker
        # appears, text after it streams into a separate assistant message.
        history[-1 if not completion_started else -2] = gr.ChatMessage(
            role="assistant",
            content=parts[0],
            metadata={"title": "🧠 Thought"},
        )
        if completion_started:
            history[-1] = gr.ChatMessage(role="assistant", content=parts[1])
        elif len(parts) > 1:
            completion_started = True
            history.append(gr.ChatMessage(role="assistant", content=parts[1]))

        yield history[-1:] if not completion_started else history[-2:]

    log_message(f"Final History: {history}")
    _check_format(history, "messages")
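
# For reference, a hypothetical raw completion in the shape chat_fn() parses
# (marker placement as above; the text itself is invented):
#
#   "Let me weigh both options step by step... [BEGIN FINAL RESPONSE]
#   Option A is the better fit.[END FINAL RESPONSE]"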


print(f"Running model {model_config.get('MODE_DISPLAY_NAME')} ({model_config.get('MODEL_NAME')})")

gr.ChatInterface(
    chat_fn,
    title=title,
    description=description,
    theme=gr.themes.Default(primary_hue="green"),
    type="messages",
).launch()
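
# launch() is called with Gradio's defaults here; common overrides look like
# this (illustrative values, not taken from this app's configuration):
#
#   .launch(server_name="0.0.0.0", server_port=7860, share=False)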