<?php
/**
 * Registry Manager
 * Manages word block and layout registries for PII detection
 */

namespace Redact\Classes;

/**
 * Keeps three in-memory registries built from Textract output and persists
 * the PII knowledge base to a per-thread JSON cache file.
 *
 * - word registry:   word text => every occurrence (page + bounding box)
 * - layout registry: layout id => layout geometry, its word texts and joined text
 * - knowledge base:  word text => list of PII findings recorded for that word
 */
class RegistryManager
{
    /** File name of the persisted knowledge base inside the cache directory. */
    private const CACHE_FILE_NAME = 'registry_cache.json';

    /** Number of characters of leading context stored before a PII word. */
    private const CONTEXT_LEAD = 20;

    /** Total number of characters stored in a PII context snippet. */
    private const CONTEXT_LENGTH = 60;

    private array $wordRegistry = [];          // word_text => [all occurrences with positions]
    private array $layoutRegistry = [];        // layout_id => layout info + word texts
    private array $processedWordBlocks = [];   // word_text => list of pii_info (knowledge base)

    private ?string $threadId = null;          // Current thread ID
    private ?string $cacheDir = null;          // Cache directory for this thread

    /**
     * Constructor
     *
     * The cache is loaded immediately when both a thread ID and a cache
     * directory are supplied. The cache directory is NOT derived from the
     * thread ID; when it is omitted, caching is simply disabled.
     *
     * @param string|null $threadId Thread ID stored alongside the cached data
     * @param string|null $cacheDir Directory holding this thread's cache file
     */
    public function __construct(?string $threadId = null, ?string $cacheDir = null)
    {
        $this->threadId = $threadId;
        $this->cacheDir = $cacheDir;

        // Explicit comparisons (not truthiness) so that a thread ID such as "0" still loads its cache.
        if ($threadId !== null && $threadId !== '' && $cacheDir !== null && $cacheDir !== '') {
            $this->loadCachedRegistry();
        }
    }

    /**
     * Load the cached knowledge base from the cache directory.
     *
     * On success the in-memory knowledge base is replaced by the cached one.
     * On any failure (no cache directory, missing/unreadable file, malformed
     * JSON, unexpected structure) the in-memory state is left untouched.
     *
     * @return bool True when a cache file was read and applied
     */
    public function loadCachedRegistry(): bool
    {
        $cacheFile = $this->cacheFilePath();
        if ($cacheFile === null || !is_dir($this->cacheDir) || !is_file($cacheFile)) {
            return false;
        }

        $raw = file_get_contents($cacheFile);
        if ($raw === false) {
            return false;
        }

        $cache = json_decode($raw, true);
        if (!is_array($cache)
            || !isset($cache['processedWordBlocks'])
            || !is_array($cache['processedWordBlocks'])) {
            return false;
        }

        // Every entry must be a list of findings; drop anything else so later
        // count()/foreach calls cannot fail on a hand-edited or corrupt file.
        $this->processedWordBlocks = array_filter($cache['processedWordBlocks'], 'is_array');

        return true;
    }

    /**
     * Write the knowledge base (plus a few statistics) to the cache file.
     *
     * The cache directory is created when it does not exist yet.
     *
     * @return bool True when the file was written; false when caching is
     *              disabled, the directory cannot be created, the data cannot
     *              be encoded, or the write fails
     */
    public function saveCachedRegistry(): bool
    {
        $cacheFile = $this->cacheFilePath();
        if ($cacheFile === null) {
            return false;
        }

        // Re-check is_dir() after a failed mkdir() in case another process created it meanwhile.
        if (!is_dir($this->cacheDir)
            && !mkdir($this->cacheDir, 0755, true)
            && !is_dir($this->cacheDir)) {
            return false;
        }

        $cache = [
            'thread_id' => $this->threadId,
            'last_updated' => date('Y-m-d H:i:s'),
            'processedWordBlocks' => $this->processedWordBlocks,
            'statistics' => [
                'unique_pii_words' => count($this->processedWordBlocks),
                'total_pii_instances' => $this->countPiiInstances($this->processedWordBlocks),
            ],
        ];

        // Substitute invalid UTF-8 instead of failing, and never write the
        // result of a failed encode: file_put_contents() would happily write
        // an empty file and report success, wiping the existing cache.
        $json = json_encode($cache, JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE);
        if ($json === false) {
            return false;
        }

        return file_put_contents($cacheFile, $json) !== false;
    }

    /**
     * Describe the cache file on disk (not the in-memory state).
     *
     * The returned array always has the same keys. 'cache_exists' is true
     * whenever the cache file is present, even if its content is unusable,
     * in which case the counters are 0 and the other fields null.
     *
     * @return array Keys: cache_exists, cached_words, total_instances, last_updated, thread_id
     */
    public function getCacheStatistics(): array
    {
        $stats = [
            'cache_exists' => false,
            'cached_words' => 0,
            'total_instances' => 0,
            'last_updated' => null,
            'thread_id' => null,
        ];

        $cacheFile = $this->cacheFilePath();
        if ($cacheFile === null || !is_dir($this->cacheDir) || !file_exists($cacheFile)) {
            return $stats;
        }

        $stats['cache_exists'] = true;

        $raw = file_get_contents($cacheFile);
        $cache = $raw === false ? null : json_decode($raw, true);
        if (!is_array($cache)) {
            return $stats;
        }

        $cachedWords = $cache['processedWordBlocks'] ?? [];
        $stats['cached_words'] = is_array($cachedWords) ? count($cachedWords) : 0;
        $stats['total_instances'] = $cache['statistics']['total_pii_instances'] ?? 0;
        $stats['last_updated'] = $cache['last_updated'] ?? null;
        $stats['thread_id'] = $cache['thread_id'] ?? null;

        return $stats;
    }

    /**
     * Delete the cache file and reset the in-memory knowledge base.
     *
     * Nothing is touched (and false is returned) when no cache directory is
     * configured or it does not exist.
     *
     * @return bool True when the cache file is gone afterwards; false when it
     *              could not be deleted (the in-memory data is reset anyway)
     */
    public function clearCache(): bool
    {
        $cacheFile = $this->cacheFilePath();
        if ($cacheFile === null || !is_dir($this->cacheDir)) {
            return false;
        }

        $removed = !file_exists($cacheFile) || unlink($cacheFile);

        $this->processedWordBlocks = [];

        return $removed;
    }

    /**
     * Build registries from Textract blocks
     *
     * Every LAYOUT_* block with a bounding box is stored in the layout
     * registry; every WORD block with text and a bounding box is appended to
     * the word registry under its (UTF-8 normalised) text.
     *
     * @param array $blocks Textract blocks
     * @param int $pageNum Page number
     * @return array Returns [layouts, words] for this page
     */
    public function buildRegistriesFromBlocks(array $blocks, int $pageNum): array
    {
        // Index blocks by ID so relationship IDs can be resolved directly.
        $blockMap = [];
        foreach ($blocks as $block) {
            if (isset($block['Id'])) {
                $blockMap[$block['Id']] = $block;
            }
        }

        $pageLayouts = [];
        $pageWords = [];

        foreach ($blocks as $block) {
            $blockType = $block['BlockType'] ?? '';
            if (!is_string($blockType) || !isset($block['Geometry']['BoundingBox'])) {
                continue;
            }

            if (strncmp($blockType, 'LAYOUT_', 7) === 0) {
                // The layout index is its position among the layouts found so far on this page.
                $layout = $this->extractLayout($block, $blockMap, $pageNum, count($pageLayouts));
                if ($layout !== null) {
                    $pageLayouts[] = $layout;
                    $this->layoutRegistry[$layout['id']] = $layout;
                }
            } elseif ($blockType === 'WORD' && isset($block['Text'])) {
                $wordBlock = $this->extractWordBlock($block, $pageNum);
                $pageWords[] = $wordBlock;
                $this->wordRegistry[$wordBlock['text']][] = $wordBlock;
            }
        }

        return [$pageLayouts, $pageWords];
    }

    /**
     * Extract layout information from block
     *
     * @param array $block Layout block from Textract (must carry Geometry.BoundingBox)
     * @param array $blockMap All blocks mapped by ID
     * @param int $pageNum Page number
     * @param int $layoutIndex Layout index on page
     * @return array|null Layout information
     */
    private function extractLayout(array $block, array $blockMap, int $pageNum, int $layoutIndex): ?array
    {
        $bbox = $block['Geometry']['BoundingBox'];
        $layoutWords = $this->collectLayoutWords($block, $blockMap);

        return [
            'id' => "page{$pageNum}_layout{$layoutIndex}",
            'page' => $pageNum,
            'type' => $block['BlockType'],
            'confidence' => $block['Confidence'] ?? 1.0,
            'left' => $bbox['Left'],
            'top' => $bbox['Top'],
            'width' => $bbox['Width'],
            'height' => $bbox['Height'],
            'word_blocks' => $layoutWords,
            // Words joined by single spaces; mapPIIToWordBlocks() relies on exactly this layout.
            'text' => implode(' ', $layoutWords),
            'word_count' => count($layoutWords),
        ];
    }

    /**
     * Collect the word texts below a layout block, in relationship order.
     *
     * Handles WORD blocks that are direct children as well as WORD blocks
     * reached through a LINE child (LAYOUT -> LINE -> WORD).
     *
     * @param array $block Layout block
     * @param array $blockMap All blocks mapped by ID
     * @return array List of normalised word texts
     */
    private function collectLayoutWords(array $block, array $blockMap): array
    {
        $words = [];

        foreach ($this->childBlocks($block, $blockMap) as $child) {
            $childType = $child['BlockType'] ?? '';

            if ($childType === 'WORD') {
                if (isset($child['Text'])) {
                    $words[] = $this->normalizeText($child['Text']);
                }
                continue;
            }

            if ($childType !== 'LINE') {
                continue;
            }

            foreach ($this->childBlocks($child, $blockMap) as $grandChild) {
                if (($grandChild['BlockType'] ?? '') === 'WORD' && isset($grandChild['Text'])) {
                    $words[] = $this->normalizeText($grandChild['Text']);
                }
            }
        }

        return $words;
    }

    /**
     * Resolve the CHILD relationships of a block to the referenced blocks.
     *
     * IDs that are not present in the block map are skipped.
     *
     * @param array $block Parent block
     * @param array $blockMap All blocks mapped by ID
     * @return array List of child blocks
     */
    private function childBlocks(array $block, array $blockMap): array
    {
        $children = [];

        foreach ($block['Relationships'] ?? [] as $relationship) {
            if (($relationship['Type'] ?? null) !== 'CHILD') {
                continue;
            }
            foreach ($relationship['Ids'] ?? [] as $childId) {
                if (!empty($blockMap[$childId])) {
                    $children[] = $blockMap[$childId];
                }
            }
        }

        return $children;
    }

    /**
     * Re-encode text from UTF-8 to UTF-8.
     *
     * The same normalisation is applied to layout words and to word-registry
     * keys so that both can be matched against each other by text.
     *
     * @param string $text Raw text from a Textract block
     * @return string Normalised text
     */
    private function normalizeText(string $text): string
    {
        return mb_convert_encoding($text, 'UTF-8', 'UTF-8');
    }

    /**
     * Extract word block information
     *
     * @param array $block Word block from Textract (must carry Text and Geometry.BoundingBox)
     * @param int $pageNum Page number
     * @return array Word block information
     */
    private function extractWordBlock(array $block, int $pageNum): array
    {
        $bbox = $block['Geometry']['BoundingBox'];

        return [
            'page' => $pageNum,
            'text' => $this->normalizeText($block['Text']),
            'confidence' => $block['Confidence'] ?? 1.0,
            'left' => $bbox['Left'],
            'top' => $bbox['Top'],
            'width' => $bbox['Width'],
            'height' => $bbox['Height'],
        ];
    }

    /**
     * Determine which layouts need processing
     *
     * A layout needs processing when it has words and at least one of them
     * has no entry in the knowledge base.
     *
     * NOTE(review): mapPIIToWordBlocks() only records words that overlap a
     * PII entity, so a layout containing any non-PII word keeps being
     * returned here unless the knowledge base was filled from elsewhere
     * (e.g. a loaded cache) - confirm this is intended.
     *
     * @return array Array of layout IDs that need processing
     */
    public function getLayoutsToProcess(): array
    {
        $layoutsToProcess = [];

        foreach ($this->layoutRegistry as $layoutId => $layout) {
            $words = $layout['word_blocks'] ?? [];

            // Explicit comparisons: empty() would also discard a layout whose whole text is "0".
            if ($words === [] || ($layout['text'] ?? '') === '') {
                continue;
            }

            foreach ($words as $word) {
                if (!isset($this->processedWordBlocks[$word])) {
                    $layoutsToProcess[] = $layoutId;
                    break;
                }
            }
        }

        return $layoutsToProcess;
    }

    /**
     * Map PII entities back to word blocks
     *
     * Word positions are reconstructed by assuming $layoutText is the
     * layout's words joined by single spaces (the layout's own 'text').
     * Offsets are treated as character (code point) offsets, as returned by
     * Comprehend, so all position arithmetic is multibyte-aware. Entities
     * without BeginOffset/EndOffset are ignored.
     *
     * @param array $entities PII entities from Comprehend
     * @param string $layoutId Layout ID
     * @param string $layoutText Full layout text
     */
    public function mapPIIToWordBlocks(array $entities, string $layoutId, string $layoutText): void
    {
        $layout = $this->layoutRegistry[$layoutId] ?? null;
        if (!$layout) {
            return;
        }

        // Word spans do not depend on the entity, so compute them once.
        $spans = [];
        $cursor = 0;
        foreach ($layout['word_blocks'] as $word) {
            $length = mb_strlen((string) $word, 'UTF-8');
            $spans[] = [$word, $cursor, $cursor + $length];
            $cursor += $length + 1; // +1 for the joining space
        }

        foreach ($entities as $entity) {
            if (!isset($entity['BeginOffset'], $entity['EndOffset'])) {
                continue;
            }

            $begin = $entity['BeginOffset'];
            $end = $entity['EndOffset'];

            foreach ($spans as [$word, $wordStart, $wordEnd]) {
                $overlaps = ($begin >= $wordStart && $begin < $wordEnd)
                    || ($end > $wordStart && $end <= $wordEnd)
                    || ($begin <= $wordStart && $end >= $wordEnd);

                if (!$overlaps) {
                    continue;
                }

                // mb_substr keeps the snippet valid UTF-8; a byte-wise cut could
                // split a character and make the cache impossible to JSON-encode.
                $this->processedWordBlocks[$word][] = [
                    'type' => $entity['Type'] ?? null,
                    'score' => $entity['Score'] ?? null,
                    'context' => mb_substr(
                        $layoutText,
                        max(0, $wordStart - self::CONTEXT_LEAD),
                        self::CONTEXT_LENGTH,
                        'UTF-8'
                    ),
                ];
            }
        }
    }

    /**
     * Apply PII to all occurrences via word registry
     *
     * Emits one PII block per knowledge-base finding for every word block on
     * the page whose text is known to be PII.
     *
     * @param array $pageData Page data with word_blocks
     * @return array PII blocks for this page
     */
    public function applyPIIToPage(array $pageData): array
    {
        $piiBlocks = [];

        foreach ($pageData['word_blocks'] ?? [] as $wordBlock) {
            if (!isset($wordBlock['text'])) {
                continue;
            }

            $word = $wordBlock['text'];
            $findings = $this->processedWordBlocks[$word] ?? [];
            if (!is_array($findings)) {
                continue;
            }

            foreach ($findings as $piiInfo) {
                $piiBlocks[] = [
                    'Type' => $piiInfo['type'] ?? null,
                    'Text' => $word,
                    'Score' => $piiInfo['score'] ?? null,
                    'Left' => $wordBlock['left'],
                    'Top' => $wordBlock['top'],
                    'Width' => $wordBlock['width'],
                    'Height' => $wordBlock['height'],
                    'Context' => $piiInfo['context'] ?? '',
                ];
            }
        }

        return $piiBlocks;
    }

    /**
     * Get layout by ID
     *
     * @param string $layoutId Layout ID
     * @return array|null Layout data, or null when the ID is unknown
     */
    public function getLayout(string $layoutId): ?array
    {
        return $this->layoutRegistry[$layoutId] ?? null;
    }

    /**
     * Get statistics about the in-memory registries
     *
     * @return array Registry statistics
     */
    public function getStatistics(): array
    {
        return [
            'total_layouts' => count($this->layoutRegistry),
            'unique_words' => count($this->wordRegistry),
            'total_words' => array_sum(array_map('count', $this->wordRegistry)),
            'pii_words' => count($this->processedWordBlocks),
            'thread_id' => $this->threadId,
            'cache_enabled' => $this->cacheDir !== null,
        ];
    }

    /**
     * Set thread context for caching and try to load that thread's cache
     *
     * NOTE(review): when the new thread has no cache file the knowledge base
     * collected so far stays in memory - confirm that carrying it over to
     * the new thread is intended.
     *
     * @param string $threadId Thread ID
     * @param string $cacheDir Cache directory
     */
    public function setThreadContext(string $threadId, string $cacheDir): void
    {
        $this->threadId = $threadId;
        $this->cacheDir = $cacheDir;
        $this->loadCachedRegistry();
    }

    /**
     * Full path of the cache file, or null when no cache directory is configured.
     *
     * @return string|null Cache file path
     */
    private function cacheFilePath(): ?string
    {
        if ($this->cacheDir === null || $this->cacheDir === '') {
            return null;
        }

        return $this->cacheDir . '/' . self::CACHE_FILE_NAME;
    }

    /**
     * Sum the number of findings over all knowledge-base entries.
     *
     * @param array $processedWordBlocks word text => list of findings
     * @return int Total number of findings
     */
    private function countPiiInstances(array $processedWordBlocks): int
    {
        $total = 0;
        foreach ($processedWordBlocks as $findings) {
            if (is_array($findings)) {
                $total += count($findings);
            }
        }

        return $total;
    }
}

