1. 🔧 Gemini Client
Google Generative AI client configuration with tuned default parameters.
import google.generativeai as genai
from app.config import settings

# Global configuration
genai.configure(api_key=settings.GEMINI_API_KEY)

# Client with default parameters
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config={
        "temperature": 0.7,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    },
    safety_settings={
        "HARM_CATEGORY_HARASSMENT": "BLOCK_NONE",
        "HARM_CATEGORY_HATE_SPEECH": "BLOCK_NONE",
    },
)
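A quick smoke test for the configured client; the prompt text is illustrative, and it assumes GEMINI_API_KEY is valid in app.config:

# Sanity check: one synchronous call using the defaults above
response = model.generate_content("Reply with the single word: ok")
print(response.text)
print(response.usage_metadata)  # prompt/candidates/total token counts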
2. 📝 Prompt Management
Prompt template system with variables and versioning.
# prompts/chat_template.py
CHAT_SYSTEM_PROMPT = """
You are an assistant specialized in document analysis.

NOTEBOOK CONTEXT:
{notebook_context}

AVAILABLE SOURCES:
{sources_summary}

ACTIVE PERSONA: {persona}

USER MESSAGE:
{message}

RULES:
1. Answer ONLY based on the provided sources
2. Cite the source when using specific information
3. If you do not know the answer, say so explicitly
4. Keep a professional tone
"""

def build_prompt(context: dict) -> str:
    return CHAT_SYSTEM_PROMPT.format(**context)
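The section description mentions versioning, which the snippet itself does not show. A minimal sketch of a versioned template registry; the PROMPT_VERSIONS name and layout are assumptions for illustration:

# Hypothetical registry: template name -> version -> template string
PROMPT_VERSIONS = {
    "chat": {
        "v1": CHAT_SYSTEM_PROMPT,
        # "v2": CHAT_SYSTEM_PROMPT_V2,  # register future revisions here
    }
}

def get_prompt(name: str, version: str = "v1") -> str:
    """Look up a template by name and version, failing loudly if missing."""
    try:
        return PROMPT_VERSIONS[name][version]
    except KeyError:
        raise ValueError(f"Unknown prompt {name!r} version {version!r}")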
3. 📊 Token Counting
Token estimation and tracking for cost control.
Pre-Request Counting
class TokenLimitError(Exception):
    """Raised when a prompt exceeds the token budget."""

def count_tokens(text: str) -> int:
    """Count tokens before sending the request"""
    return model.count_tokens(text).total_tokens

# Limit check
if count_tokens(prompt) > 30000:
    raise TokenLimitError("Prompt exceeds the token limit")
Cost Tracking
# USD per token (list price: $3.50 input / $10.50 output per 1M tokens)
PRICING = {
    "gemini-1.5-pro": {
        "input": 0.0035 / 1000,
        "output": 0.0105 / 1000,
    }
}

def calculate_cost(usage, model_name: str = "gemini-1.5-pro") -> float:
    # usage is the usage_metadata object from a response
    return (
        usage.prompt_token_count * PRICING[model_name]["input"]
        + usage.candidates_token_count * PRICING[model_name]["output"]
    )
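A worked example at these rates: 10,000 input tokens cost 10,000 × 0.0000035 = $0.035 and 1,000 output tokens cost 1,000 × 0.0000105 = $0.0105, for $0.0455 in total:

from types import SimpleNamespace

# Stand-in for response.usage_metadata in a quick test
sample = SimpleNamespace(prompt_token_count=10_000, candidates_token_count=1_000)
assert abs(calculate_cost(sample) - 0.0455) < 1e-9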
4. 🔄 Response Streaming
Streaming implementation for real-time responses.
async def stream_chat(prompt: str):
    """Generate a response as a stream"""
    response = await model.generate_content_async(
        prompt,
        stream=True
    )
    async for chunk in response:
        if chunk.parts:  # chunk.text raises when a chunk has no valid part
            yield {
                "type": "chunk",
                "content": chunk.text
            }
    # Final metadata (available once the stream is fully consumed)
    yield {
        "type": "done",
        "tokens_used": response.usage_metadata.total_token_count,
        "finish_reason": response.candidates[0].finish_reason
    }
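A minimal consumer for the generator, handy for local testing; asyncio.run and the sample prompt are illustrative:

import asyncio

async def main():
    # Print chunks as they arrive; the final event carries the metadata
    async for event in stream_chat("Summarize the attached sources."):
        if event["type"] == "chunk":
            print(event["content"], end="", flush=True)
        else:
            print(f"\n[done] tokens={event['tokens_used']}")

asyncio.run(main())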
5. 🛡️ Retry and Rate Limiting
Error handling and request rate control.
from tenacity import retry, stop_after_attempt, wait_exponential
import asyncio

# Rate limiter
rate_limiter = asyncio.Semaphore(10)  # 10 concurrent requests

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    reraise=True
)
async def call_gemini(prompt: str):
    async with rate_limiter:
        try:
            return await model.generate_content_async(prompt)
        except Exception as e:
            if "429" in str(e):
                # Extra backoff on rate-limit errors before tenacity retries
                await asyncio.sleep(60)
            raise
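A small sketch showing how the semaphore bounds concurrency when many calls are fired at once; run_batch and the prompt list are illustrative:

async def run_batch(prompts: list[str]) -> list[str]:
    # At most 10 coroutines hold the semaphore; the rest wait their turn
    responses = await asyncio.gather(*(call_gemini(p) for p in prompts))
    return [r.text for r in responses]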
6. 📋 Complete Service Layer
A service that encapsulates all of the AI integration logic.
class GeminiService:
    def __init__(self):
        self.model = genai.GenerativeModel("gemini-1.5-pro")
        self.rate_limiter = asyncio.Semaphore(10)

    async def chat(
        self,
        message: str,
        context: NotebookContext,
        persona: str = "professional"
    ) -> ChatResponse:
        # 1. Build the prompt
        prompt = build_prompt({
            "message": message,
            "notebook_context": context.summary,
            "sources_summary": context.sources,
            "persona": persona
        })
        # 2. Validate the token budget
        tokens = self.model.count_tokens(prompt).total_tokens
        if tokens > 30000:
            raise TokenLimitError(tokens)
        # 3. Execute with retry
        response = await self._call_with_retry(prompt)
        # 4. Return a structured result
        return ChatResponse(
            response=response.text,
            tokens_used=response.usage_metadata.total_token_count,
            cost_usd=calculate_cost(response.usage_metadata)
        )
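_call_with_retry is referenced above but not shown; a minimal sketch of the missing method, reusing the tenacity pattern and imports from section 5:

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        reraise=True
    )
    async def _call_with_retry(self, prompt: str):
        # Bound concurrency with the instance semaphore; retry transient failures
        async with self.rate_limiter:
            return await self.model.generate_content_async(prompt)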
📝 Module Summary
✓ Client - Tuned Gemini configuration
✓ Prompts - Versioned, dynamic templates
✓ Tokens - Counting and cost control
✓ Service - Encapsulated, resilient layer