Comprehensive Python examples for integrating Inferno AI into your applications.

All examples use the official OpenAI Python SDK, since Inferno serves an OpenAI-compatible API:

```bash
pip install openai
```

Basic chat completion:

```python
from openai import OpenAI

# Initialize client pointing at the local Inferno server
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed"  # or your API key if authentication is enabled
)

# Create chat completion
response = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Python?"}
    ]
)

# Print response
print(response.choices[0].message.content)
```
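
The model name used throughout these examples (`llama-2-7b-chat`) must match a model loaded in your Inferno server. Assuming the server exposes the standard OpenAI-compatible `/v1/models` endpoint, you can list what is available:

```python
# List available models (assumes the OpenAI-compatible /v1/models endpoint)
models = client.models.list()
for model in models.data:
    print(model.id)
```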

Controlling generation parameters:

```python
response = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[
        {"role": "user", "content": "Write a poem about AI"}
    ],
    temperature=0.8,  # Higher values give more creative output
    max_tokens=200,   # Limit response length
    top_p=0.9,        # Nucleus sampling threshold
    seed=42           # Reproducible results
)

print(response.choices[0].message.content)
```

Streaming responses token by token:

```python
stream = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()  # New line at end
```

Streaming with error handling:

```python
try:
    stream = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": "Count to 10"}],
        stream=True
    )
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)
except Exception as e:
    print(f"Error: {e}")
```

For asynchronous code, use `AsyncOpenAI`:

```python
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
    response = await client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": "Hello!"}]
    )
    print(response.choices[0].message.content)

# Run
asyncio.run(main())
```

Async streaming:

```python
async def stream_response():
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
    stream = await client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": "Tell me a joke"}],
        stream=True
    )
    async for chunk in stream:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)

asyncio.run(stream_response())
```

Running multiple requests concurrently:

```python
async def concurrent_requests():
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
    prompts = [
        "What is AI?",
        "Explain machine learning",
        "What is deep learning?"
    ]

    # Create tasks
    tasks = [
        client.chat.completions.create(
            model="llama-2-7b-chat",
            messages=[{"role": "user", "content": prompt}]
        )
        for prompt in prompts
    ]

    # Run concurrently
    responses = await asyncio.gather(*tasks)

    for i, response in enumerate(responses):
        print(f"\nPrompt {i+1}: {prompts[i]}")
        print(f"Response: {response.choices[0].message.content}")

asyncio.run(concurrent_requests())
```
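
When sending many requests at once to a single local server, it can help to cap how many are in flight. A minimal sketch using `asyncio.Semaphore`; the limit of 4 is an arbitrary example, not an Inferno requirement:

```python
async def bounded_requests(prompts, limit=4):
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
    semaphore = asyncio.Semaphore(limit)  # at most `limit` requests in flight

    async def ask(prompt):
        async with semaphore:
            response = await client.chat.completions.create(
                model="llama-2-7b-chat",
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content

    return await asyncio.gather(*(ask(p) for p in prompts))
```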

Maintaining conversation history across turns:

```python
messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]

def chat(user_message):
    # Add user message
    messages.append({"role": "user", "content": user_message})

    # Get response
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=messages
    )

    # Add assistant response
    assistant_message = response.choices[0].message.content
    messages.append({"role": "assistant", "content": assistant_message})

    return assistant_message

# Conversation
print(chat("Hello, my name is Alice"))
print(chat("What's my name?"))  # Should remember "Alice"
print(chat("Tell me a fun fact"))
```

Long conversations eventually exceed the model's context window, so trim the history before each request:

```python
def truncate_messages(messages, keep_last=10):
    """Keep the system prompt plus only the most recent messages."""
    # Simple approach: preserve the system message and the last N messages
    system = [m for m in messages if m["role"] == "system"]
    recent = [m for m in messages if m["role"] != "system"][-keep_last:]
    return system + recent

messages = truncate_messages(messages)
```
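
If you need to respect an actual token budget rather than a fixed message count, you can estimate token usage. The sketch below uses a rough characters-per-token ratio (about 4 characters per token), which is an approximation, not an exact tokenizer count:

```python
def truncate_to_budget(messages, max_tokens=2048, chars_per_token=4):
    """Drop the oldest non-system messages until a rough token estimate fits."""
    def estimate(msgs):
        return sum(len(m["content"]) for m in msgs) // chars_per_token

    system = [m for m in messages if m["role"] == "system"]
    rest = [m for m in messages if m["role"] != "system"]
    while rest and estimate(system + rest) > max_tokens:
        rest.pop(0)  # remove the oldest message first
    return system + rest
```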

Handling errors with retries and exponential backoff:

```python
from openai import OpenAI, OpenAIError
import time

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

def chat_with_retry(prompt, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="llama-2-7b-chat",
                messages=[{"role": "user", "content": prompt}],
                timeout=30.0
            )
            return response.choices[0].message.content
        except OpenAIError as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                raise
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

# Use it
try:
    result = chat_with_retry("Hello!")
    print(result)
except Exception as e:
    print(f"Failed after retries: {e}")
```

Processing a batch of prompts sequentially:

```python
prompts = [
    "Summarize: Python is a programming language...",
    "Translate to French: Hello, how are you?",
    "Generate code: Fibonacci sequence function"
]

results = []
for prompt in prompts:
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": prompt}]
    )
    results.append(response.choices[0].message.content)

for i, result in enumerate(results):
    print(f"\nPrompt {i+1}: {prompts[i][:50]}...")
    print(f"Response: {result[:100]}...")
```

Or in parallel with a thread pool:

```python
from concurrent.futures import ThreadPoolExecutor

def process_prompt(prompt):
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

prompts = ["Prompt 1", "Prompt 2", "Prompt 3", "Prompt 4"]

with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_prompt, prompts))

for prompt, result in zip(prompts, results):
    print(f"{prompt}: {result}")
```

Exposing Inferno behind a Flask API:

```python
from flask import Flask, request, jsonify, stream_with_context, Response
from openai import OpenAI

app = Flask(__name__)
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

@app.route('/api/chat', methods=['POST'])
def chat():
    data = request.json
    message = data.get('message')

    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": message}]
    )

    return jsonify({
        "response": response.choices[0].message.content
    })

@app.route('/api/chat/stream', methods=['POST'])
def chat_stream():
    data = request.json
    message = data.get('message')

    def generate():
        stream = client.chat.completions.create(
            model="llama-2-7b-chat",
            messages=[{"role": "user", "content": message}],
            stream=True
        )
        for chunk in stream:
            if chunk.choices[0].delta.content:
                yield f"data: {chunk.choices[0].delta.content}\n\n"

    return Response(stream_with_context(generate()),
                    mimetype='text/event-stream')

if __name__ == '__main__':
    app.run(port=5000)
```

Or with FastAPI and the async client:

```python
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from openai import AsyncOpenAI

app = FastAPI()
client = AsyncOpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

class ChatRequest(BaseModel):
    message: str
    stream: bool = False

@app.post("/chat")
async def chat(request: ChatRequest):
    try:
        if request.stream:
            async def generate():
                stream = await client.chat.completions.create(
                    model="llama-2-7b-chat",
                    messages=[{"role": "user", "content": request.message}],
                    stream=True
                )
                async for chunk in stream:
                    if chunk.choices[0].delta.content:
                        yield f"data: {chunk.choices[0].delta.content}\n\n"
            return StreamingResponse(generate(), media_type="text/event-stream")
        else:
            response = await client.chat.completions.create(
                model="llama-2-7b-chat",
                messages=[{"role": "user", "content": request.message}]
            )
            return {"response": response.choices[0].message.content}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
```
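
To consume the streaming endpoint from another Python process, you can read the server-sent events line by line. A minimal sketch using `httpx` against the FastAPI service above; the URL and payload shape match that example:

```python
import httpx

# Read the SSE stream produced by the FastAPI /chat endpoint above
with httpx.stream("POST", "http://localhost:8000/chat",
                  json={"message": "Hello!", "stream": True},
                  timeout=None) as response:
    for line in response.iter_lines():
        if line.startswith("data: "):
            print(line[len("data: "):], end="", flush=True)
print()
```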

A complete interactive command-line chatbot:

```python
#!/usr/bin/env python3
from openai import OpenAI
import sys

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]

def chat(user_input):
    messages.append({"role": "user", "content": user_input})
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=messages
    )
    assistant_message = response.choices[0].message.content
    messages.append({"role": "assistant", "content": assistant_message})
    return assistant_message

def main():
    print("Chatbot (type 'exit' to quit)")
    print("-" * 40)

    while True:
        try:
            user_input = input("\nYou: ").strip()

            if not user_input:
                continue

            if user_input.lower() in ['exit', 'quit']:
                print("Goodbye!")
                break

            response = chat(user_input)
            print(f"\nAssistant: {response}")

        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            sys.exit(0)
        except Exception as e:
            print(f"\nError: {e}")

if __name__ == "__main__":
    main()
```