Source code for science_live.pipeline.nl_generator

"""
NaturalLanguageGenerator - Convert results back to natural language
"""

from typing import Dict, List, Optional, Any
import re
import logging

# Import all required classes from common
from .common import (
    ProcessedResults,
    ProcessingContext,
    NaturalLanguageResult,
    RosettaStatement,
)

# ============================================================================
# NATURAL LANGUAGE GENERATOR
# ============================================================================

class NaturalLanguageGenerator:
    """Convert results back to natural language"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.logger = logging.getLogger(self.__class__.__name__)

    async def generate(self, processed_results: ProcessedResults,
                       context: ProcessingContext) -> NaturalLanguageResult:
        """Generate a natural language summary of results"""
        self.logger.info(
            f"Generating natural language for {processed_results.total_found} results"
        )

        if processed_results.total_found == 0:
            return self._generate_no_results_response(context)

        # Generate summary
        summary = self._generate_summary(processed_results, context)

        # Generate detailed results
        detailed_results = await self._generate_detailed_results(processed_results)

        # Generate confidence explanation
        confidence_explanation = self._generate_confidence_explanation(processed_results)

        # Generate suggestions
        suggestions = self._generate_suggestions(processed_results, context)

        # Generate execution summary
        execution_summary = self._generate_execution_summary(context)

        result = NaturalLanguageResult(
            summary=summary,
            detailed_results=detailed_results,
            confidence_explanation=confidence_explanation,
            suggestions=suggestions,
            execution_summary=execution_summary
        )

        self.logger.info("Generated natural language response")
        return result

    def _generate_summary(self, processed_results: ProcessedResults,
                          context: ProcessingContext) -> str:
        """Generate high-level summary"""
        total = processed_results.total_found
        confidence = processed_results.processing_confidence

        # Analyze result types
        type_counts = {}
        for result_type, results in processed_results.groupings.get('by_type', {}).items():
            type_counts[result_type] = len(results)

        # Generate summary based on the most common result type
        if 'citation' in type_counts and type_counts['citation'] > total * 0.5:
            summary = f"Found {total} citation relationships"
        elif 'rosetta_statement' in type_counts:
            summary = f"Found {total} scientific statements"
        else:
            summary = f"Found {total} relevant nanopublications"

        # Add confidence qualifier
        if confidence >= 0.8:
            summary += " with high confidence."
        elif confidence >= 0.5:
            summary += " with moderate confidence."
        else:
            summary += " with low confidence."

        return summary

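    # Example output (hypothetical counts, not from the module): with 12
    # results, most of which are grouped under 'citation', and a
    # processing_confidence of 0.85, this returns
    # "Found 12 citation relationships with high confidence."
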
    async def _generate_detailed_results(self, processed_results: ProcessedResults) -> List[str]:
        """Generate detailed descriptions of individual results"""
        detailed = []

        # Limit to the top results by confidence
        top_results = sorted(
            processed_results.results,
            key=lambda x: x.confidence,
            reverse=True
        )[:10]

        for i, result in enumerate(top_results, 1):
            if result.rosetta_statement and result.rosetta_statement.dynamic_label_template:
                # Use the Rosetta statement's natural language representation
                description = self._rosetta_to_natural_language(result.rosetta_statement)
            else:
                # Fall back to a generic description
                description = f"Nanopublication {result.nanopub_uri}"

            # Add confidence indicator
            if result.confidence >= 0.8:
                confidence_indicator = "✓"
            elif result.confidence >= 0.5:
                confidence_indicator = "~"
            else:
                confidence_indicator = "?"

            detailed.append(f"{i}. {confidence_indicator} {description}")

        return detailed

    def _rosetta_to_natural_language(self, rosetta_statement: RosettaStatement) -> str:
        """Convert a Rosetta statement to natural language"""
        template = rosetta_statement.dynamic_label_template
        if not template:
            return (
                f"{rosetta_statement.subject.label or rosetta_statement.subject.text} "
                f"{rosetta_statement.statement_type_label}"
            )

        # Replace placeholders
        result = template.replace(
            "SUBJECT",
            rosetta_statement.subject.label or rosetta_statement.subject.text
        )

        if rosetta_statement.required_object1:
            result = result.replace(
                "OBJECT1",
                rosetta_statement.required_object1.label or rosetta_statement.required_object1.text
            )
        if rosetta_statement.optional_object1:
            result = result.replace(
                "OBJECT2",
                rosetta_statement.optional_object1.label or rosetta_statement.optional_object1.text
            )

        # Clean up any remaining placeholders
        result = re.sub(r'(OBJECT[0-9]+|SUBJECT)', '', result)
        result = re.sub(r'\s+', ' ', result).strip()

        return result

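    # Worked example (hypothetical template and labels): given the template
    # "SUBJECT is cited by OBJECT1", a subject labelled "Paper A", and a
    # required_object1 labelled "Paper B", this method returns
    # "Paper A is cited by Paper B". Any placeholder without a matching
    # object (e.g. OBJECT2) is stripped and the whitespace collapsed.
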
    def _generate_confidence_explanation(self, processed_results: ProcessedResults) -> str:
        """Explain the confidence level"""
        confidence = processed_results.processing_confidence

        if confidence >= 0.8:
            return "High confidence: Results closely match your query with well-structured data."
        elif confidence >= 0.6:
            return "Good confidence: Results are relevant with mostly complete information."
        elif confidence >= 0.4:
            return "Moderate confidence: Results may be relevant but information is incomplete."
        else:
            return "Low confidence: Results are uncertain and may not fully match your query."

    def _generate_suggestions(self, processed_results: ProcessedResults,
                              context: ProcessingContext) -> List[str]:
        """Generate suggestions for related queries"""
        suggestions = []

        # Analyze result patterns to suggest related queries
        if 'citation' in processed_results.groupings.get('by_type', {}):
            suggestions.append("Try searching for 'papers authored by [author name]'")
            suggestions.append("Search for 'recent citations of this work'")

        if processed_results.total_found > 50:
            suggestions.append("Add more specific terms to narrow your search")
        elif processed_results.total_found < 5:
            suggestions.append("Try broader terms or check spelling")
            suggestions.append("Use alternative phrasings of your question")

        # Add generic suggestions
        suggestions.extend([
            "Explore related concepts using 'what is related to [topic]'",
            "Find author information with ORCID: 'work by 0000-0000-0000-0000'"
        ])

        return suggestions[:5]

    def _generate_execution_summary(self, context: ProcessingContext) -> Dict[str, Any]:
        """Generate summary of the execution process"""
        # Use the context's elapsed time method
        total_time = context.get_elapsed_time()

        return {
            'total_execution_time': round(total_time, 2),
            'query_processed': context.original_question,
            'pipeline_steps_completed': 7,
            'debug_mode': context.debug_mode
        }

    def _generate_no_results_response(self, context: ProcessingContext) -> NaturalLanguageResult:
        """Generate a response when no results are found"""
        # Check for execution errors that might explain the empty result set
        execution_errors = getattr(context, 'execution_errors', [])
        network_errors = [
            err for err in execution_errors
            if 'network' in str(err).lower() or 'connection' in str(err).lower()
        ]

        if network_errors:
            # A network error caused the empty result set
            summary = "Unable to search nanopublications due to network connectivity issues."
            confidence_explanation = (
                "Could not connect to the nanopublication network to process your query."
            )
            suggestions = [
                "Please try again in a few moments",
                "Check your internet connection",
                "The nanopublication servers may be temporarily unavailable",
                "Try a simpler query if the problem persists"
            ]
        else:
            # Normal no-results case
            summary = "No results found for your query."
            confidence_explanation = "Unable to find matching information in the nanopub network."
            suggestions = [
                "Check spelling and try different terms",
                "Use more general concepts",
                "Try asking about related topics",
                "Include specific identifiers like DOI or ORCID if available"
            ]

        return NaturalLanguageResult(
            summary=summary,
            detailed_results=[],
            confidence_explanation=confidence_explanation,
            suggestions=suggestions,
            execution_summary=self._generate_execution_summary(context)
        )
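
# ============================================================================
# USAGE SKETCH (illustrative, not part of the module)
# ============================================================================
# A minimal sketch of driving the generator, assuming `processed_results` and
# `context` were produced by the earlier pipeline stages (their construction
# happens outside this module):
#
#     import asyncio
#
#     async def summarize(processed_results, context):
#         generator = NaturalLanguageGenerator()
#         result = await generator.generate(processed_results, context)
#         print(result.summary)
#         for line in result.detailed_results:
#             print(line)
#         for tip in result.suggestions:
#             print(f"Suggestion: {tip}")
#
#     # asyncio.run(summarize(processed_results, context))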