# kods/py/1_prepare_data.py
import pandas as pd
import pickle
import time
import sys
import shutil
import sqlite3
import json
import re
import os
from pathlib import Path
from datetime import datetime

# --- CONFIGURATION ---
# Directory three levels above this file; it is the parent of both the
# 'kods' and 'csv' directories used below.
SCRIPT_DIR = Path(__file__).parent.parent.parent.resolve()
KODS_DIR = SCRIPT_DIR / 'kods'
CSV_DIR = SCRIPT_DIR / 'csv'
CACHE_DIR = KODS_DIR / 'cache'
NACE_DB_PATH = KODS_DIR / "nace_stats.sqlite"
SEARCH_DB_PATH = KODS_DIR / "companies.sqlite"

# Extend the import path so that the `processing` package is looked up under
# KODS_DIR; this must run before the import below.
sys.path.append(str(KODS_DIR))

# If the loader module cannot be imported, print the error and exit with
# status 1.
try:
    from processing import loader
except ImportError as e:
    print(f"KĻŪDA: {e}")
    sys.exit(1)

# Tables that must NOT be loaded, to save RAM and CPU.
EXCLUDED_TABLES = {
    'officers',
    'beneficial_owners',
    'stockholders',
    'members',
    'members_joint_owners',
    'stockholders_joint_owners'
}

# --- MEKLĒŠANAS FUNKCIJAS ---
# Characters that are replaced by a space before tokenising a name.
_SEARCH_PUNCTUATION_RE = re.compile(r'[„”""\'`\(\)\[\]\{\}:;,./\\|\-]')
_WHITESPACE_RE = re.compile(r'\s+')

# Legal-form stop phrases, pre-tokenised and ordered longest-first so that a
# multi-word form ("ārvalsts komersanta filiāle") is consumed as a whole
# before a shorter form it contains ("filiāle") is tried.
_LEGAL_FORM_PHRASES = sorted(
    (
        tuple(phrase.split())
        for phrase in (
            "ik", "sia", "as", "ps", "ks", "zs", "nodibinājums", "biedrība",
            "individuālais komersants", "sabiedrība ar ierobežotu atbildību",
            "akciju sabiedrība", "pilnsabiedrība", "komandītsabiedrība",
            "zemnieku saimniecība", "kooperatīvā sabiedrība", "filiāle",
            "ārvalsts komersanta filiāle", "individuālais uzņēmums",
            "pašvaldības uzņēmums", "pārstāvniecība", "fonds",
        )
    ),
    key=len,
    reverse=True,
)


def normalize_for_search(name_str):
    """Normalise a company name into a search helper string.

    The name is lower-cased, punctuation and quotes are replaced by spaces,
    and legal-form designations (single words such as "sia" as well as
    multi-word forms such as "sabiedrība ar ierobežotu atbildību") are
    removed wherever they occur.

    Args:
        name_str: Raw company name; any value is accepted and converted with
            ``str()``. Falsy values and float NaN yield an empty result.

    Returns:
        The cleaned name, followed (space-separated, in sorted order) by the
        same name with all spaces removed when that variant differs.
        Returns ``""`` when nothing is left after cleaning.
    """
    if not name_str:
        return ""
    # A float NaN (missing cell) is truthy and would otherwise become "nan".
    if isinstance(name_str, float) and name_str != name_str:
        return ""

    # 1. Basic clean-up: case, invisible spaces, punctuation.
    text = str(name_str).lower()
    text = text.replace('\xa0', ' ').replace('\u200b', ' ')
    text = _SEARCH_PUNCTUATION_RE.sub(' ', text)
    words = _WHITESPACE_RE.sub(' ', text).strip().split()

    # 2. Drop legal forms. Phrases are matched as whole token sequences;
    #    comparing single tokens only would never remove the multi-word forms.
    kept = []
    i = 0
    total = len(words)
    while i < total:
        for phrase in _LEGAL_FORM_PHRASES:
            size = len(phrase)
            if tuple(words[i:i + size]) == phrase:
                i += size
                break
        else:
            kept.append(words[i])
            i += 1
    text = " ".join(kept)

    # 3. Unique variants: the spaced form and, if different, the joined form.
    unique_parts = set()
    if text:
        unique_parts.add(text)

    text_no_space = text.replace(' ', '')
    if text_no_space and text_no_space != text:
        unique_parts.add(text_no_space)

    return " ".join(sorted(unique_parts))

def _cell_text(value):
    """Return a register cell as stripped text; None/NaN/NA become ''."""
    if value is None:
        return ''
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value).strip()


def build_search_database(dataframes):
    """Build the SQLite database used by live search.

    Reads ``dataframes['register']`` and writes the file at SEARCH_DB_PATH
    with a ``companies`` table (regcode, display name, search helper) and an
    FTS5 index ``companies_fts`` over the search helper, kept in sync by
    triggers.

    A register row is skipped when its regcode is not exactly 11 digits,
    when its termination date is set and earlier than 2020-01-01 (the
    placeholder '0000-00-00' does not count as a date), or when no display
    name can be derived.

    Args:
        dataframes: Mapping of table name to DataFrame; must contain
            'register'. Columns missing from the register are treated as
            empty.

    Returns:
        None. Insert failures are printed, not raised.
    """
    print("   -> Ģenerē Meklēšanas datubāzi (companies.sqlite)...")

    # --- Prepare the rows first, so a failure here leaves the old DB intact.
    # Only the columns that are actually used are pulled out of the register;
    # reindex() supplies missing columns as NaN, which _cell_text maps to ''.
    wanted = ['regcode', 'terminated', 'type', 'name_in_quotes',
              'name_before_quotes', 'name']
    reg_df = dataframes['register']

    batch_data = []
    for raw in reg_df.reindex(columns=wanted).itertuples(index=False, name=None):
        regcode, term_date, type_val, name_in_quotes, name_before, original_name = (
            _cell_text(cell) for cell in raw
        )

        # Simple validation.
        if not (regcode.isdigit() and len(regcode) == 11):
            continue

        # ISO dates compare correctly as strings.
        if term_date and term_date < '2020-01-01' and term_date != '0000-00-00':
            continue

        # Display-name logic.
        display_name = original_name
        if type_val == 'ZEM':
            parts = [p for p in [type_val, name_in_quotes, name_before] if p]
            display_name = ", ".join(parts)
        elif type_val and name_in_quotes:
            display_name = f"{type_val}, {name_in_quotes}"
        elif name_in_quotes:
            display_name = name_in_quotes

        if not display_name:
            continue

        # Search helper: regcode plus the normalised name.
        base_name = name_in_quotes or name_before or original_name
        normalized = normalize_for_search(base_name)

        if not normalized and display_name != base_name:
            normalized = normalize_for_search(display_name)

        batch_data.append((regcode, display_name, f"{regcode} {normalized}"))

    count = len(batch_data)

    # --- Write the database.
    if SEARCH_DB_PATH.exists():
        try:
            SEARCH_DB_PATH.unlink()
        except OSError as e:
            # Best effort: the stale schema is dropped below instead.
            print(f"      Brīdinājums: neizdevās izdzēst veco meklēšanas DB: {e}")

    conn = sqlite3.connect(SEARCH_DB_PATH)
    try:
        cursor = conn.cursor()

        # If the old file could not be removed, start from a clean schema;
        # otherwise CREATE VIRTUAL TABLE / CREATE TRIGGER would fail.
        cursor.execute('DROP TABLE IF EXISTS companies_fts')
        cursor.execute('DROP TABLE IF EXISTS companies')

        # Table structure.
        cursor.execute('CREATE TABLE IF NOT EXISTS companies (regcode TEXT PRIMARY KEY, original_name TEXT NOT NULL, search_helper TEXT NOT NULL)')
        cursor.execute('CREATE VIRTUAL TABLE companies_fts USING fts5(search_helper_content, content="companies", content_rowid="rowid", tokenize = "unicode61 remove_diacritics 2")')
        cursor.execute('CREATE TRIGGER companies_ai AFTER INSERT ON companies BEGIN INSERT INTO companies_fts (rowid, search_helper_content) VALUES (new.rowid, new.search_helper); END;')
        # External-content FTS5 tables are updated with the special 'delete'
        # command carrying the OLD values; a plain DELETE/UPDATE would need to
        # read a 'search_helper_content' column from `companies`, which does
        # not exist.
        cursor.execute("CREATE TRIGGER companies_ad AFTER DELETE ON companies BEGIN INSERT INTO companies_fts (companies_fts, rowid, search_helper_content) VALUES ('delete', old.rowid, old.search_helper); END;")
        cursor.execute("CREATE TRIGGER companies_au AFTER UPDATE ON companies BEGIN INSERT INTO companies_fts (companies_fts, rowid, search_helper_content) VALUES ('delete', old.rowid, old.search_helper); INSERT INTO companies_fts (rowid, search_helper_content) VALUES (new.rowid, new.search_helper); END;")

        # Bulk insert (faster than row-by-row).
        try:
            cursor.executemany('INSERT OR IGNORE INTO companies (regcode, original_name, search_helper) VALUES (?, ?, ?)', batch_data)
            conn.commit()
            print(f"      + Indeksēti {count} uzņēmumi meklēšanai")
        except sqlite3.Error as e:
            print(f"      Kļūda veidojot meklēšanas DB: {e}")
    finally:
        # Always release the file handle, including on schema errors.
        conn.close()


# --- NACE FUNKCIJAS ---
def build_nace_database(dataframes, nace_df):
    """Build the per-NACE-code statistics database at NACE_DB_PATH.

    Active companies (not closed with 'L'/'R' and without a termination
    date) are enriched with:

    * the NACE code and average employee count from each company's most
      recent tax row (table
      'pdb_nm_komersantu_samaksato_nodoklu_kopsumas_odata'), and
    * turnover, profit and employee count from each company's most recent
      annual statement ('financial_statements' joined to
      'income_statements'), scaled by the statement's rounding unit.

    Both sources are optional; missing values default to 0 and a missing
    NACE code to '0000'. The employee count is the maximum of the two
    sources. For every NACE code one row is written to table ``nace_stats``
    holding a JSON document with the companies that have non-zero profit,
    turnover or employees, sorted by profit in descending order.

    Args:
        dataframes: Mapping of table name to DataFrame; must contain
            'register'. NOTE: the tax and financial-statement frames are
            modified in place (helper column / numeric year / sort order),
            as before.
        nace_df: Unused; kept so existing callers keep working.

    Returns:
        None.
    """
    print("   -> Ģenerē NACE statistikas datubāzi...")

    reg_df = dataframes['register']
    not_closed = ~reg_df['closed'].isin(['L', 'R'])
    not_terminated = reg_df['terminated'].isna() | reg_df['terminated'].isin(['', '0000-00-00'])
    main_df = reg_df.loc[not_closed & not_terminated, ['regcode', 'name']].copy()

    # 1. Tax authority (VID) data: latest taxation year per company.
    tax_df = dataframes.get('pdb_nm_komersantu_samaksato_nodoklu_kopsumas_odata')
    if tax_df is not None and not tax_df.empty:
        # In-place on the caller's frame on purpose (previous behaviour).
        tax_df['year_int'] = pd.to_numeric(tax_df['Taksacijas_gads'], errors='coerce').fillna(0)
        tax_df.sort_values(by=['Registracijas_kods', 'year_int'], ascending=[True, False], inplace=True)
        # rename() returns a new frame, so the assignment below does not
        # write into a slice of tax_df.
        latest_tax = tax_df.drop_duplicates(subset=['Registracijas_kods'], keep='first')[
            ['Registracijas_kods', 'Pamatdarbibas_NACE_kods', 'Videjais_nodarbinato_personu_skaits_cilv']
        ].rename(columns={
            'Registracijas_kods': 'regcode',
            'Pamatdarbibas_NACE_kods': 'nace_raw',
            'Videjais_nodarbinato_personu_skaits_cilv': 'employees_vid',
        })
        latest_tax['employees_vid'] = pd.to_numeric(latest_tax['employees_vid'], errors='coerce').fillna(0).astype(int)
        main_df = pd.merge(main_df, latest_tax, on='regcode', how='left')

    # 2. Financial data: latest annual statement per company.
    fs_df = dataframes.get('financial_statements')
    inc_df = dataframes.get('income_statements')

    if fs_df is not None and inc_df is not None:
        # In-place on the caller's frame on purpose (previous behaviour).
        fs_df['year'] = pd.to_numeric(fs_df['year'], errors='coerce').fillna(0)
        fs_df.sort_values(by=['legal_entity_registration_number', 'year'], ascending=[True, False], inplace=True)
        latest_fs = fs_df.drop_duplicates(subset=['legal_entity_registration_number'], keep='first')
        merged_fs = pd.merge(latest_fs, inc_df, left_on='id', right_on='statement_id', how='left')

        merged_fs['net_turnover'] = pd.to_numeric(merged_fs['net_turnover'], errors='coerce').fillna(0)
        merged_fs['net_income'] = pd.to_numeric(merged_fs['net_income'], errors='coerce').fillna(0)
        # Employee count as reported in the annual statement.
        merged_fs['employees_fs'] = pd.to_numeric(merged_fs['employees'], errors='coerce').fillna(0).astype(int)

        def get_mult(x):
            """Map the statement's rounding unit to a multiplier."""
            x = str(x).upper()
            return 1000 if 'THOUS' in x else (1000000 if 'MILL' in x else 1)

        merged_fs['mult'] = merged_fs['rounded_to_nearest'].apply(get_mult)
        merged_fs['turnover'] = merged_fs['net_turnover'] * merged_fs['mult']
        merged_fs['profit'] = merged_fs['net_income'] * merged_fs['mult']

        main_fs = merged_fs[
            ['legal_entity_registration_number', 'turnover', 'profit', 'employees_fs']
        ].rename(columns={'legal_entity_registration_number': 'regcode'})
        # Several income rows for one statement would otherwise list the
        # same company more than once.
        main_fs = main_fs.drop_duplicates(subset=['regcode'], keep='first')
        main_df = pd.merge(main_df, main_fs, on='regcode', how='left')

    # Either source may be absent; make sure every column used below exists
    # instead of failing with KeyError.
    column_defaults = {
        'nace_raw': '0000',
        'turnover': 0,
        'profit': 0,
        'employees_vid': 0,
        'employees_fs': 0,
    }
    for column, default in column_defaults.items():
        if column not in main_df.columns:
            main_df[column] = default

    # 3. NACE code: strip dots/whitespace, map unknown markers to '0000'.
    main_df['nace_raw'] = main_df['nace_raw'].fillna('0000').replace('?', '0000')
    main_df['nace_code'] = main_df['nace_raw'].astype(str).str.replace('.', '', regex=False).str.strip()
    main_df.loc[main_df['nace_code'].isin(['nan', '', 'None', '0']), 'nace_code'] = '0000'

    # 4. Fill gaps. A NaN name would be serialised as the bare token NaN,
    #    which is not valid JSON.
    main_df = main_df.fillna({'turnover': 0, 'profit': 0, 'employees_vid': 0, 'employees_fs': 0})
    main_df['name'] = main_df['name'].fillna('')

    # Combined employee count: the larger of the tax and annual-statement
    # figures, so data is shown when only one of the sources has it.
    main_df['employees_final'] = main_df[['employees_vid', 'employees_fs']].max(axis=1).astype(int)

    # 5. One JSON document per NACE code, built before touching the DB file.
    base_meta = {"generated": datetime.now().strftime('%Y-%m-%d'), "unit": "EUR", "source": "Saraksts.lv"}
    batch_inserts = []
    for nace_code, group in main_df.groupby('nace_code'):
        if len(nace_code) < 2 and nace_code != '0000':
            continue

        # Keep companies with profit, turnover OR employees (from any source).
        has_data = (
            (group['profit'] != 0)
            | (group['turnover'] != 0)
            | (group['employees_final'] > 0)
        )
        group_with_data = group[has_data]
        if group_with_data.empty:
            continue

        group_sorted = group_with_data.sort_values(by='profit', ascending=False)
        companies_list = group_sorted[['name', 'regcode', 'profit', 'turnover', 'employees_final']].values.tolist()

        output_data = {"meta": base_meta, "companies": companies_list, "nace_code": nace_code}
        batch_inserts.append((nace_code, json.dumps(output_data, ensure_ascii=False)))

    count = len(batch_inserts)

    # 6. Write the database.
    if NACE_DB_PATH.exists():
        try:
            NACE_DB_PATH.unlink()
        except OSError as e:
            # Best effort: the stale table is dropped below instead.
            print(f"      Brīdinājums: neizdevās izdzēst veco NACE DB: {e}")

    conn = sqlite3.connect(NACE_DB_PATH)
    try:
        cursor = conn.cursor()
        # Rows left over from a file that could not be deleted would collide
        # with the PRIMARY KEY on insert.
        cursor.execute("DROP TABLE IF EXISTS nace_stats")
        cursor.execute("CREATE TABLE IF NOT EXISTS nace_stats (code TEXT PRIMARY KEY, json_data TEXT)")
        cursor.executemany("INSERT INTO nace_stats (code, json_data) VALUES (?, ?)", batch_inserts)
        conn.commit()
    finally:
        conn.close()
    print(f"      + NACE DB gatava: {count} nozares (ar labotu darbinieku skaitu)")


def main():
    """Run the data-preparation pipeline.

    Steps:
        1. Recreate CACHE_DIR, read NACE.csv from KODS_DIR and load every
           CSV table in CSV_DIR whose name is not in EXCLUDED_TABLES.
        2. Build the search and the NACE statistics SQLite databases.
        3. Pickle the loaded data to CACHE_DIR / "data_bundle.pkl".

    Every fatal problem (NACE.csv missing or unreadable, CSV directory
    missing, empty register table, pickle failure) is printed and ends the
    process with exit status 1, so that callers of the script can detect
    the failure.
    """
    start_time = time.time()
    print("--- DATU SAGATAVOŠANA (PIPELINE) ---")

    if CACHE_DIR.exists():
        shutil.rmtree(CACHE_DIR)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # 1. Loading
    print("1. Ielādē datus...")
    try:
        nace_df = pd.read_csv(KODS_DIR / 'NACE.csv', dtype={'Kods': str})
    except FileNotFoundError:
        print("   KĻŪDA: Nav NACE.csv")
        sys.exit(1)
    except (OSError, ValueError) as e:
        # pandas parser/empty-data errors and decoding errors are ValueError
        # subclasses; report them as what they are instead of "file missing".
        print(f"   KĻŪDA: Neizdevās nolasīt NACE.csv: {e}")
        sys.exit(1)

    if not CSV_DIR.exists():
        print("   KĻŪDA: Nav csv mapes")
        sys.exit(1)

    # Skip the tables listed in EXCLUDED_TABLES.
    table_names = [
        path.stem
        for path in CSV_DIR.glob('*.csv')
        if path.stem not in EXCLUDED_TABLES
    ]
    dataframes = loader.load_all_csv_data(CSV_DIR, table_names)

    if 'register' not in dataframes or dataframes['register'].empty:
        print("   KĻŪDA: register.csv tukšs")
        sys.exit(1)

    # 2. Processing
    print("2. Ģenerē datubāzes...")
    build_search_database(dataframes)
    build_nace_database(dataframes, nace_df)

    # 3. Cache
    print("3. Saglabā Pickle failu...")
    data_bundle = {
        'nace_df': nace_df,
        'dataframes': dataframes,
        'generated_at': datetime.now().isoformat()
    }

    try:
        with open(CACHE_DIR / "data_bundle.pkl", 'wb') as f:
            pickle.dump(data_bundle, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"   -> Pabeigts {time.time() - start_time:.1f}s")
    except Exception as e:
        print(f"   Kļūda: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()