"""AI-powered insights using OpenAI GPT-4."""
import os
from typing import Dict, List, Any, Optional
import pandas as pd
from openai import OpenAI
from ..utils.errors import ProcessingError
from ..utils.logging import get_logger
logger = get_logger(__name__)
class AIInsightEngine:
    """Generate AI-powered insights using GPT-4.

    Thin wrapper around the OpenAI chat-completions API that produces
    statistically cautious narratives for Share of Search data. When no API
    key is configured the engine degrades gracefully: summary methods return
    template text and the other generators return empty results.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize AI insight engine.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            # A missing key is not fatal: callers receive fallback/empty output.
            logger.warning("OpenAI API key not found - AI insights will be disabled")
            self.client = None
        else:
            self.client = OpenAI(api_key=self.api_key)
            logger.info("AI insight engine initialized")

    def generate_executive_summary(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame,
        market_concentration: Dict[str, Any]
    ) -> str:
        """
        Generate executive summary of findings.

        Args:
            df: Full time series DataFrame
            period_metrics: Aggregate period metrics
            market_concentration: Market concentration metrics

        Returns:
            Executive summary text. Falls back to a static template summary
            when the OpenAI client is unavailable or the API call fails.
        """
        if not self.client:
            return self._fallback_executive_summary(period_metrics)
        try:
            # Prepare data summary
            data_summary = self._prepare_data_summary(df, period_metrics, market_concentration)
            prompt = f"""You are a statistical analyst with strict scientific integrity. Analyze this Share of Search data.
DATA QUALITY NOTICE:
- Google Trends uses undisclosed sampling methods (Choi & Varian 2012)
- Same query on different days shows correlation of only 0.79-0.94 (Cebrián & Domenech 2023)
- Documented measurement error: ±5% variability between retrievals
- Coverage bias: excludes non-Google users and specialized platforms
Data Summary:
{data_summary}
Provide a 3-paragraph statistical summary:
1. Market Position Analysis:
- Report shares with ±5% measurement error acknowledgment
- Use correlation language, not causal claims
- State "observed patterns" not "explanations"
2. Temporal Patterns:
- Report statistical trends (direction, correlation coefficients where applicable)
- Acknowledge alternative explanations
- Use "correlated with" not "caused by"
3. Statistical Context:
- Market concentration metrics
- Volatility comparisons
- Data limitations that affect interpretation
CONSTRAINTS:
- NEVER claim causation without experimental evidence
- ALWAYS quantify uncertainty
- Use statistical language throughout
- Acknowledge data limitations explicitly"""
            # Low temperature keeps the narrative close to the supplied numbers.
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a rigorous statistical analyst. You NEVER make causal claims without experimental evidence. You ALWAYS acknowledge measurement error and data limitations."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=600
            )
            summary = response.choices[0].message.content.strip()
            logger.info("Generated AI executive summary")
            return summary
        except Exception as e:
            # Best-effort: any API/network failure degrades to the template.
            logger.warning(f"AI summary generation failed: {e}")
            return self._fallback_executive_summary(period_metrics)

    def explain_anomalies(
        self,
        df: pd.DataFrame,
        anomalies: pd.DataFrame
    ) -> List[Dict[str, str]]:
        """
        Generate statistical descriptions of detected anomalies.

        Args:
            df: Full DataFrame
            anomalies: DataFrame of anomalous points (must contain 'query',
                'date', 'share_of_search' and 'z_score' columns)

        Returns:
            List of anomaly descriptions. Empty when the client is
            unavailable, no anomalies exist, or the API call fails.
        """
        if not self.client or anomalies.empty:
            return []
        try:
            explanations = []
            # Limit to the 3 most extreme anomalies ranked by |z-score| so
            # that large downward deviations (negative z) are ranked
            # alongside spikes rather than silently dropped.
            top_anomalies = anomalies.loc[anomalies['z_score'].abs().nlargest(3).index]
            for _, anomaly in top_anomalies.iterrows():
                query = anomaly['query']
                date = anomaly['date'].strftime('%Y-%m-%d')
                value = anomaly['share_of_search']
                z_score = anomaly['z_score']
                # Get context (surrounding data)
                query_data = df[df['query'] == query].sort_values('date')
                avg_value = query_data['share_of_search'].mean()
                # Guard against a zero baseline, which would put inf/NaN
                # into the prompt text.
                deviation_pct = ((value - avg_value) / avg_value) * 100 if avg_value else 0.0
                prompt = f"""STATISTICAL OBSERVATION: Anomaly detected in Share of Search data.
Brand: {query}
Date: {date}
Observed Share: {value:.1f}%
Average Share: {avg_value:.1f}%
Deviation: {deviation_pct:+.1f}%
Z-score: {z_score:.2f}
DATA QUALITY NOTICE: Google Trends has ±5% measurement error (Cebrián & Domenech 2023).
Provide a 2-3 sentence statistical description:
1. Quantify the deviation in statistical terms
2. Note that causation cannot be determined from this data alone
3. List 2-3 possible explanations that would require independent verification
CONSTRAINTS:
- Use "correlated with" or "associated with" not "caused by"
- State "requires external validation" for any causal hypothesis
- Acknowledge measurement error could explain part or all of the deviation
- Do NOT make definitive causal claims"""
                response = self.client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "system", "content": "You are a statistical analyst. You describe observations without claiming causation. You ALWAYS acknowledge measurement error and the need for external validation."},
                              {"role": "user", "content": prompt}],
                    temperature=0.3,
                    max_tokens=200
                )
                explanation = response.choices[0].message.content.strip()
                explanations.append({
                    "query": query,
                    "date": date,
                    "value": f"{value:.1f}%",
                    "explanation": explanation
                })
            logger.info(f"Generated {len(explanations)} anomaly descriptions")
            return explanations
        except Exception as e:
            # Best-effort: a single failed call abandons the whole batch.
            logger.warning(f"Anomaly description failed: {e}")
            return []

    def generate_competitive_insights(
        self,
        period_metrics: pd.DataFrame
    ) -> str:
        """
        Generate statistical competitive analysis.

        Args:
            period_metrics: Aggregate metrics per brand

        Returns:
            Statistical competitive analysis text. Empty string when the
            client is unavailable or the API call fails.
        """
        if not self.client:
            return ""
        try:
            # Prepare metrics summary
            metrics_text = period_metrics.to_string()
            prompt = f"""Analyze these Share of Search metrics with statistical rigor:
DATA QUALITY NOTICE:
- Google Trends measurement error: ±5% (Cebrián & Domenech 2023)
- Share values should be interpreted with this uncertainty
- Correlations do not imply causation
{metrics_text}
Provide statistical analysis in 4 sections:
1. RELATIVE POSITIONING
- Report rank order with measurement uncertainty
- State "Brand X observed at Y% ± 5%" not "Brand X dominates"
- Use statistical language
2. VOLATILITY PATTERNS
- Compare volatility metrics quantitatively
- Higher volatility = less predictable search patterns
- Acknowledge this is descriptive, not prescriptive
3. OBSERVED CORRELATIONS
- Note any patterns in the data
- Use "correlated with" or "associated with"
- Do NOT claim these explain performance differences
4. DATA LIMITATIONS
- Remind that this is search data only, not market performance
- Note coverage biases (excludes non-Google users)
- Strategic decisions require additional data sources
CONSTRAINTS:
- NO causal claims without experimental evidence
- NO strategic recommendations (requires external data)
- Use statistical language throughout
- Acknowledge uncertainty explicitly"""
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "system", "content": "You are a statistical analyst. You describe patterns in data without making causal or strategic claims. You ALWAYS acknowledge measurement error and data limitations."},
                          {"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=700
            )
            insights = response.choices[0].message.content.strip()
            logger.info("Generated competitive statistical analysis")
            return insights
        except Exception as e:
            logger.warning(f"Competitive analysis generation failed: {e}")
            return ""

    def generate_recommendations(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame
    ) -> Dict[str, str]:
        """
        Generate statistical profiles per brand (NOT strategic recommendations).

        Args:
            df: Full time series
            period_metrics: Aggregate metrics

        Returns:
            Dictionary mapping brand to statistical profile. Empty when the
            client is unavailable or the API call fails.
        """
        if not self.client:
            return {}
        try:
            profiles = {}
            for _, row in period_metrics.iterrows():
                brand = row['query']
                avg_share = row['avg_share']
                volatility = row.get('volatility', 0)
                # Get trend info from the last row.
                # NOTE(review): assumes df is already ordered by date — verify
                # upstream, otherwise iloc[-1] may not be the latest reading.
                brand_data = df[df['query'] == brand]
                trend = brand_data['trend_direction'].iloc[-1] if len(brand_data) > 0 else 'stable'
                prompt = f"""Generate a statistical profile for this brand (NOT recommendations):
Brand: {brand}
Average Share of Search: {avg_share:.1f}% (±5% measurement error)
Trend Direction: {trend}
Volatility: {volatility:.2f}
DATA QUALITY NOTICE: Google Trends has ±5% measurement error (Cebrián & Domenech 2023)
Provide a 3-point statistical profile:
1. RELATIVE POSITION
- Where this brand sits relative to competitors in the dataset
- Note measurement uncertainty
2. TEMPORAL PATTERN
- Describe the trend statistically (direction, approximate slope)
- State "observed correlation with time" not "performance improvement"
3. VARIABILITY CONTEXT
- Interpret volatility value
- Higher volatility = less predictable search patterns
- This is descriptive only
CONSTRAINTS:
- NO strategic recommendations (requires domain expertise + external data)
- NO causal claims
- Statistical description only
- Acknowledge this is search data, not market performance
- End with: "Strategic decisions require additional data sources beyond Google Trends"
"""
                response = self.client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "system", "content": "You are a statistical analyst. You provide statistical descriptions ONLY, never strategic advice. You ALWAYS acknowledge measurement error and data limitations."},
                              {"role": "user", "content": prompt}],
                    temperature=0.3,
                    max_tokens=2000
                )
                profiles[brand] = response.choices[0].message.content.strip()
            logger.info(f"Generated statistical profiles for {len(profiles)} brands")
            return profiles
        except Exception as e:
            logger.warning(f"Statistical profile generation failed: {e}")
            return {}

    def _prepare_data_summary(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame,
        market_concentration: Dict[str, Any]
    ) -> str:
        """Prepare a plain-text data summary for inclusion in AI prompts.

        Args:
            df: Full time series with 'date', 'query', 'trend_direction'
                and 'trend_slope' columns
            period_metrics: Per-brand aggregates ('query', 'avg_share',
                optional 'volatility')
            market_concentration: Dict with optional 'hhi' and
                'concentration' keys

        Returns:
            Multi-line summary string (period, concentration, per-brand
            metrics and trends).
        """
        summary_parts = []
        # Period info
        start_date = df['date'].min().strftime('%Y-%m-%d')
        end_date = df['date'].max().strftime('%Y-%m-%d')
        summary_parts.append(f"Period: {start_date} to {end_date}")
        # Market concentration
        hhi = market_concentration.get('hhi', 0)
        concentration = market_concentration.get('concentration', 'unknown')
        summary_parts.append(f"\nMarket Concentration: HHI={hhi:.0f} ({concentration})")
        # Brand metrics
        summary_parts.append("\nBrand Performance:")
        for _, row in period_metrics.iterrows():
            brand = row['query']
            avg_share = row['avg_share']
            volatility = row.get('volatility', 0)
            summary_parts.append(f"- {brand}: {avg_share:.1f}% avg share, volatility={volatility:.2f}")
        # Trends (last row per query; assumes df is ordered by date)
        summary_parts.append("\nTrends:")
        for query in df['query'].unique():
            query_data = df[df['query'] == query]
            if len(query_data) > 0:
                trend = query_data['trend_direction'].iloc[-1]
                slope = query_data['trend_slope'].iloc[-1]
                summary_parts.append(f"- {query}: {trend} (slope={slope:.3f})")
        return "\n".join(summary_parts)

    def _fallback_executive_summary(self, period_metrics: pd.DataFrame) -> str:
        """Generate a static summary when no OpenAI client is available.

        Args:
            period_metrics: Per-brand aggregates with 'query' and
                'avg_share' columns (must be non-empty)

        Returns:
            Template summary naming the leader (and runner-up if present).
        """
        # Find leader
        leader = period_metrics.loc[period_metrics['avg_share'].idxmax()]
        summary = f"""Share of Search Analysis Summary
Market Leader: {leader['query']} with {leader['avg_share']:.1f}% average share.
The analysis covers {len(period_metrics)} brands/queries over the specified period. """
        if len(period_metrics) > 1:
            second = period_metrics.nlargest(2, 'avg_share').iloc[1]
            summary += f"Second place: {second['query']} at {second['avg_share']:.1f}% share. "
        summary += "\n\nFor detailed AI-powered insights, please configure an OpenAI API key."
        return summary