
Python Integration

Comprehensive Python examples for integrating Inferno AI into your applications.

Installation

pip install openai
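
Inferno exposes an OpenAI-compatible API, so the official openai package is all you need. As a quick sanity check, you can list the models the server has loaded — a minimal sketch, assuming your Inferno instance serves the standard /v1/models endpoint:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

# Print the IDs of the models the server currently exposes
for model in client.models.list():
    print(model.id)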

Basic Usage

Simple Chat Completion

from openai import OpenAI
 
# Initialize client
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed"  # or your API key if authentication is enabled
)
 
# Create chat completion
response = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Python?"}
    ]
)
 
# Print response
print(response.choices[0].message.content)

With Parameters

response = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[
        {"role": "user", "content": "Write a poem about AI"}
    ],
    temperature=0.8,      # More creative
    max_tokens=200,       # Limit response length
    top_p=0.9,
    seed=42              # Reproducible results
)
 
print(response.choices[0].message.content)
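
Beyond the message text, the response object carries metadata such as the finish reason and, if the server reports it, token usage — useful for tracking context size. A short sketch:

choice = response.choices[0]
print(choice.finish_reason)   # e.g. "stop" or "length" when max_tokens was hit

usage = response.usage
if usage is not None:         # usage reporting depends on the server
    print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)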

Streaming Responses

Basic Streaming

stream = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True
)
 
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
 
print()  # New line at end
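
If you need the complete text after streaming — for logging, or to append to a conversation history — accumulate the chunks as they arrive:

stream = client.chat.completions.create(
    model="llama-2-7b-chat",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True
)

chunks = []
for chunk in stream:
    content = chunk.choices[0].delta.content
    if content:
        print(content, end="", flush=True)
        chunks.append(content)

full_text = "".join(chunks)  # complete response for logging or history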

Streaming with Error Handling

try:
    stream = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": "Count to 10"}],
        stream=True
    )
 
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)
 
except Exception as e:
    print(f"Error: {e}")

Async/Await

Async Client

import asyncio
from openai import AsyncOpenAI
 
async def main():
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
 
    response = await client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": "Hello!"}]
    )
 
    print(response.choices[0].message.content)
 
# Run
asyncio.run(main())

Async Streaming

async def stream_response():
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
 
    stream = await client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": "Tell me a joke"}],
        stream=True
    )
 
    async for chunk in stream:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
 
asyncio.run(stream_response())

Concurrent Requests

async def concurrent_requests():
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
 
    prompts = [
        "What is AI?",
        "Explain machine learning",
        "What is deep learning?"
    ]
 
    # Create tasks
    tasks = [
        client.chat.completions.create(
            model="llama-2-7b-chat",
            messages=[{"role": "user", "content": prompt}]
        )
        for prompt in prompts
    ]
 
    # Run concurrently
    responses = await asyncio.gather(*tasks)
 
    for i, response in enumerate(responses):
        print(f"\nPrompt {i+1}: {prompts[i]}")
        print(f"Response: {response.choices[0].message.content}")
 
asyncio.run(concurrent_requests())
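
If the server can only handle a limited number of requests at once, you may want to cap how many are in flight instead of firing them all at the same time. A sketch using asyncio.Semaphore, reusing the AsyncOpenAI setup from above (the limit of 2 is an arbitrary example value):

async def bounded_requests(prompts, limit=2):
    client = AsyncOpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"
    )
    semaphore = asyncio.Semaphore(limit)

    async def ask(prompt):
        # At most `limit` requests hit the server at any one time
        async with semaphore:
            response = await client.chat.completions.create(
                model="llama-2-7b-chat",
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content

    return await asyncio.gather(*(ask(p) for p in prompts))

results = asyncio.run(bounded_requests(["What is AI?", "What is ML?"]))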

Context Management

Multi-turn Conversation

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]
 
def chat(user_message):
    # Add user message
    messages.append({"role": "user", "content": user_message})
 
    # Get response
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=messages
    )
 
    # Add assistant response
    assistant_message = response.choices[0].message.content
    messages.append({"role": "assistant", "content": assistant_message})
 
    return assistant_message
 
# Conversation
print(chat("Hello, my name is Alice"))
print(chat("What's my name?"))  # Should remember "Alice"
print(chat("Tell me a fun fact"))

Context Window Management

def truncate_messages(messages, max_messages=10):
    """Keep the system prompt plus only the most recent messages."""
    # Simple approach: preserve the system message, drop the oldest turns
    system = [m for m in messages if m["role"] == "system"]
    recent = [m for m in messages if m["role"] != "system"][-max_messages:]
    return system + recent
 
messages = truncate_messages(messages)
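
For tighter control you can approximate token counts rather than counting messages. Exact tokenization depends on the model, so the four-characters-per-token ratio below is only a rough heuristic, not an exact fit for the context window:

def truncate_by_tokens(messages, max_tokens=2048, chars_per_token=4):
    """Drop the oldest non-system messages until the rough token estimate fits."""
    def estimate(msgs):
        return sum(len(m["content"]) for m in msgs) // chars_per_token

    system = [m for m in messages if m["role"] == "system"]
    rest = [m for m in messages if m["role"] != "system"]

    while rest and estimate(system + rest) > max_tokens:
        rest.pop(0)  # drop the oldest turn first

    return system + rest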

Error Handling

Comprehensive Error Handling

from openai import OpenAI, OpenAIError
import time
 
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
 
def chat_with_retry(prompt, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="llama-2-7b-chat",
                messages=[{"role": "user", "content": prompt}],
                timeout=30.0
            )
            return response.choices[0].message.content
 
        except OpenAIError as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                raise
 
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise
 
# Use it
try:
    result = chat_with_retry("Hello!")
    print(result)
except Exception as e:
    print(f"Failed after retries: {e}")

Batch Processing

Process Multiple Prompts

prompts = [
    "Summarize: Python is a programming language...",
    "Translate to French: Hello, how are you?",
    "Generate code: Fibonacci sequence function"
]
 
results = []
 
for prompt in prompts:
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": prompt}]
    )
    results.append(response.choices[0].message.content)
 
for i, result in enumerate(results):
    print(f"\nPrompt {i+1}: {prompts[i][:50]}...")
    print(f"Response: {result[:100]}...")

Parallel Processing

from concurrent.futures import ThreadPoolExecutor
 
def process_prompt(prompt):
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
 
prompts = ["Prompt 1", "Prompt 2", "Prompt 3", "Prompt 4"]
 
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_prompt, prompts))
 
for prompt, result in zip(prompts, results):
    print(f"{prompt}: {result}")

Flask Web App

Simple Web API

from flask import Flask, request, jsonify, stream_with_context, Response
from openai import OpenAI
 
app = Flask(__name__)
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
 
@app.route('/api/chat', methods=['POST'])
def chat():
    data = request.json
    message = data.get('message')
 
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=[{"role": "user", "content": message}]
    )
 
    return jsonify({
        "response": response.choices[0].message.content
    })
 
@app.route('/api/chat/stream', methods=['POST'])
def chat_stream():
    data = request.json
    message = data.get('message')
 
    def generate():
        stream = client.chat.completions.create(
            model="llama-2-7b-chat",
            messages=[{"role": "user", "content": message}],
            stream=True
        )
 
        for chunk in stream:
            if chunk.choices[0].delta.content:
                yield f"data: {chunk.choices[0].delta.content}\n\n"
 
    return Response(stream_with_context(generate()),
                    mimetype='text/event-stream')
 
if __name__ == '__main__':
    app.run(port=5000)
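
To exercise the streaming endpoint from another script, you can read the server-sent events line by line with the requests library (installed separately). A quick test client, assuming the Flask app above is running on port 5000:

import requests

with requests.post(
    "http://localhost:5000/api/chat/stream",
    json={"message": "Tell me a story"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line.startswith("data: "):
            print(line[len("data: "):], end="", flush=True)
print()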

FastAPI Example

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from openai import AsyncOpenAI
from fastapi.responses import StreamingResponse
 
app = FastAPI()
client = AsyncOpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
 
class ChatRequest(BaseModel):
    message: str
    stream: bool = False
 
@app.post("/chat")
async def chat(request: ChatRequest):
    try:
        if request.stream:
            async def generate():
                stream = await client.chat.completions.create(
                    model="llama-2-7b-chat",
                    messages=[{"role": "user", "content": request.message}],
                    stream=True
                )
 
                async for chunk in stream:
                    if chunk.choices[0].delta.content:
                        yield f"data: {chunk.choices[0].delta.content}\n\n"
 
            return StreamingResponse(generate(), media_type="text/event-stream")
 
        else:
            response = await client.chat.completions.create(
                model="llama-2-7b-chat",
                messages=[{"role": "user", "content": request.message}]
            )
 
            return {"response": response.choices[0].message.content}
 
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

Command-Line Chatbot

#!/usr/bin/env python3
from openai import OpenAI
import sys
 
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
 
messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]
 
def chat(user_input):
    messages.append({"role": "user", "content": user_input})
 
    response = client.chat.completions.create(
        model="llama-2-7b-chat",
        messages=messages
    )
 
    assistant_message = response.choices[0].message.content
    messages.append({"role": "assistant", "content": assistant_message})
 
    return assistant_message
 
def main():
    print("Chatbot (type 'exit' to quit)")
    print("-" * 40)
 
    while True:
        try:
            user_input = input("\nYou: ").strip()
 
            if not user_input:
                continue
 
            if user_input.lower() in ['exit', 'quit']:
                print("Goodbye!")
                break
 
            response = chat(user_input)
            print(f"\nAssistant: {response}")
 
        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            sys.exit(0)
        except Exception as e:
            print(f"\nError: {e}")
 
if __name__ == "__main__":
    main()

Next Steps