"""
Binance API Interpreter.

Broker: Binance
Format: JSON (API sync)
Source: sync (API)
Assets: spot, futures

file_row Hash Formula (Legacy Compatibility):
---------------------------------------------
The file_row field is computed as an MD5 hash for deduplication against
the legacy TraderSync system. The formula is:

    file_row = MD5(json.dumps(order_with_pre_hash_fields))

Steps:
    1. Start with raw Binance API order (preserving original key order)
    2. Add 4 pre-hash fields at the end:
       - binance_type = 'FUTURE' or 'SPOT' (uppercase)
       - category = 'Future' or 'Spot' (title case)
       - created_at = time / 1000 (seconds as float)
       - created_at_formated = datetime.utcfromtimestamp(created_at).strftime('%Y-%m-%d %H:%M:%S')
    3. Hash: hashlib.md5(json.dumps(order).encode('utf-8')).hexdigest()

Fields NOT included in hash (added post-hash by legacy system):
    date_tz, type_stock, type_option, action, shares, comm, njson, decimal,
    expire, strike, pip_value, original_file_row, broker, userid, portfolio, archive

Match Rate: 100% against legacy data (verified with 26,182 records).
Important: Key order matters - JSON keys must preserve original API order.
"""

import polars as pl
import json
import hashlib
from typing import ClassVar, Set, List, Dict, Any
from datetime import datetime
import logging

from pipeline.p01_normalize.base import BaseInterpreter

logger = logging.getLogger(__name__)


class BinanceInterpreter(BaseInterpreter):
    """
    Interpreter for Binance API JSON format.

    Handles JSON data from Binance API with orders array.
    Supports both spot and futures categories with different field names.

    Spot format uses: isBuyer (boolean)
    Futures format uses: side (string BUY/SELL)
    """

    BROKER_ID: ClassVar[str] = "binance"
    FORMAT_VERSION: ClassVar[str] = "1.0"
    SUPPORTED_ASSETS: ClassVar[Set[str]] = {"spot", "futures"}

    # Category to asset mapping (lowercase values for output schema)
    # NOTE: For options (CALL/PUT), grouping logic needs additional documentation
    # as options have more complex grouping requirements using option_strike and option_expire
    CATEGORY_MAP: ClassVar[dict] = {
        "spot": "crypto",
        "futures": "crypto",
    }

    # Category to legacy naming used in the file_row hash:
    # lowercase category -> (binance_type, category)
    LEGACY_CATEGORY_MAP: ClassVar[dict] = {
        "spot": ("SPOT", "Spot"),
        "futures": ("FUTURE", "Future"),
    }

    # Legacy names used when the category is not in LEGACY_CATEGORY_MAP.
    _DEFAULT_LEGACY_CATEGORY: ClassVar[tuple] = ("FUTURE", "Future")

    @classmethod
    def can_handle(cls, df: pl.DataFrame, metadata: dict) -> bool:
        """
        Check if this interpreter can handle the data.

        For Binance, we check for specific columns from flattened JSON.

        Args:
            df: Candidate input frame; only its column names are inspected.
            metadata: Input metadata (not consulted by this check).

        Returns:
            True when every column common to spot and futures is present.
        """
        # Fields shared by the spot and futures payloads
        required = {"id", "symbol", "price", "qty", "time", "commission"}
        return required.issubset(df.columns)

    @classmethod
    def get_priority(cls) -> int:
        """Higher priority for Binance format."""
        return 100

    @staticmethod
    def _is_buy_order(order: dict) -> bool:
        """Return True when the raw order is a buy.

        Precedence: ``side`` (string BUY/SELL), then ``isBuyer`` (boolean),
        then ``buyer`` (boolean). Defaults to buy when none is present.
        """
        if "side" in order:
            # Guard against a null/non-string side instead of raising.
            return str(order["side"] or "").upper() == "BUY"
        if "isBuyer" in order:
            return bool(order["isBuyer"])
        if "buyer" in order:
            return bool(order["buyer"])
        return True

    @classmethod
    def parse_json_content(cls, json_content: str, category: str = "spot") -> pl.DataFrame:
        """
        Parse Binance JSON content to DataFrame.

        Args:
            json_content: Raw JSON string (dict with 'orders' key)
            category: Trading category (spot, futures) - overridden by data if present

        Returns:
            DataFrame with one row per order, including the helper columns
            ``_original_order``, ``_file_row_hash`` and ``_row_index``.
            An empty DataFrame is returned when the payload is not a dict or
            contains no usable orders.

        Raises:
            json.JSONDecodeError: If ``json_content`` is not valid JSON.
        """
        data = json.loads(json_content)

        # Data is dict with orders array
        if not isinstance(data, dict):
            logger.warning("Expected dict with orders key")
            return pl.DataFrame()

        # Use category from data if available; a null/non-string value falls
        # back to the argument so the .lower() lookup below cannot fail on it.
        data_category = data.get("category", category)
        if not isinstance(data_category, str):
            data_category = category

        orders = data.get("orders", [])
        if not isinstance(orders, list) or not orders:
            logger.warning("No orders found in JSON")
            return pl.DataFrame()

        # Legacy category names (binance_type is uppercase, category is title case).
        # Constant for the whole payload, so resolved once outside the loop.
        binance_type, legacy_category = cls.LEGACY_CATEGORY_MAP.get(
            data_category.lower(), cls._DEFAULT_LEGACY_CATEGORY
        )

        records = []
        # row_idx tracks the original position in the orders array so that
        # position calculation can preserve file order.
        for row_idx, order in enumerate(orders):
            if not isinstance(order, dict):
                logger.warning("Skipping non-object order at index %d", row_idx)
                continue

            is_buy = cls._is_buy_order(order)

            # Compute file_row hash using legacy formula:
            # 1. Start with raw order (preserving original key order)
            # 2. Add 4 pre-hash fields: binance_type, category, created_at, created_at_formated
            # 3. Hash: MD5(json.dumps(order))
            order_time = order.get("time", 0)
            try:
                time_ms = int(order_time)
                created_at = time_ms / 1000  # Convert ms to seconds (as float)
                # NOTE: utcfromtimestamp is deprecated since Python 3.12; it is
                # kept as-is because its formatted output feeds the hash.
                created_at_formated = datetime.utcfromtimestamp(created_at).strftime('%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError, OSError, OverflowError):
                # TypeError covers a null "time"; the others cover unparsable
                # or out-of-range values.
                created_at = 0
                created_at_formated = ""

            # Copy keeps the original key order; the 4 pre-hash fields go last.
            order_for_hash = dict(order)
            order_for_hash["binance_type"] = binance_type
            order_for_hash["category"] = legacy_category
            order_for_hash["created_at"] = created_at
            order_for_hash["created_at_formated"] = created_at_formated

            # Hash the JSON (no sort_keys to preserve original order)
            file_row_hash = hashlib.md5(json.dumps(order_for_hash).encode('utf-8')).hexdigest()

            records.append({
                "id": str(order.get("id", "")),
                "symbol": str(order.get("symbol", "")),
                "orderId": str(order.get("orderId", "")),
                "side": "BUY" if is_buy else "SELL",
                # "or 0" maps null/empty values to 0 before the float conversion
                "price": float(order.get("price", 0) or 0),
                "qty": float(order.get("qty", 0) or 0),
                "quoteQty": float(order.get("quoteQty", 0) or 0),
                "commission": float(order.get("commission", 0) or 0),
                "commissionAsset": str(order.get("commissionAsset", "USDT")),
                "time": str(order.get("time", "")),
                "status": str(order.get("status", "FILLED")),  # Order status for filtering
                "category": data_category,
                # Futures-specific fields
                "positionSide": str(order.get("positionSide", "")),
                "realizedPnl": float(order.get("realizedPnl", 0) or 0),
                # Store original for dedup - preserve original key order and format
                # Legacy uses standard JSON with spaces (not compact, not sorted)
                "_original_order": json.dumps(order),
                # Computed file_row hash
                "_file_row_hash": file_row_hash,
                # Row index for position calculation ordering (legacy compatibility)
                "_row_index": row_idx,
            })

        if not records:
            # Reached when every entry in the orders array was skipped.
            logger.warning("No orders found after processing")
            return pl.DataFrame()

        return pl.DataFrame(records)

    def _build_file_row_expr_json(self) -> pl.Expr:
        """
        Build original_file_row from the stored original order JSON.

        The original order is already stored as JSON string during parsing.
        NOTE: Preserving original case to match broker data exactly.
        This affects deduplication hash for new imports vs historical data.
        """
        return pl.col("_original_order")

    def normalize(self, df: pl.LazyFrame, user_id: int, account_id: str = "") -> pl.LazyFrame:
        """
        Transform Binance data to normalized schema for grouping.

        Rows that fail validation (non-FILLED status, non-positive quantity or
        price, empty symbol, unparsable timestamp, side other than BUY/SELL)
        are dropped rather than raising.

        Args:
            df: Input data as LazyFrame (from parse_json_content)
            user_id: TraderSync user ID
            account_id: Account ID from input metadata

        Returns:
            Normalized LazyFrame matching grouping.py expected schema
        """
        return (
            df
            # Build original_file_row
            .with_columns([
                self._build_file_row_expr_json().alias("original_file_row")
            ])
            # Apply transformations
            .with_columns([
                # user_id
                pl.lit(user_id).alias("user_id"),

                # account_id - from input metadata (not in order data)
                pl.lit(account_id).alias("account_id"),

                # execution_id - use id (trade id)
                pl.col("id").alias("execution_id"),

                # symbol - uppercase, surrounding whitespace removed
                pl.col("symbol").str.to_uppercase().str.strip_chars().alias("symbol"),

                # side - "BUY" or "SELL" string (already parsed in parse_json_content)
                pl.col("side").alias("side"),

                # quantity - qty
                pl.col("qty").alias("quantity"),

                # price
                pl.col("price").alias("price"),

                # timestamp - time is milliseconds since epoch, stored as a string.
                # strict=False turns unparsable values into null so the
                # timestamp validation filter below drops the row instead of
                # the cast failing the whole frame.
                pl.col("time")
                .cast(pl.Int64, strict=False)
                .cast(pl.Datetime("ms"))
                .alias("timestamp"),

                # commission (absolute value)
                pl.col("commission").abs().alias("commission"),

                # fees - not provided separately
                pl.lit(0.0).alias("fees"),

                # swap - not applicable
                pl.lit(0.0).alias("swap"),

                # currency - taken from commissionAsset
                pl.col("commissionAsset").alias("currency"),

                # asset - based on category (crypto for all Binance assets)
                pl.col("category")
                .replace_strict(self.CATEGORY_MAP, default="crypto")
                .alias("asset"),

                # option_strike - not applicable for spot/futures
                pl.lit(None).cast(pl.Float64).alias("option_strike"),

                # option_expire - not applicable
                # NOTE(review): left as an untyped null literal; confirm the
                # dtype the downstream schema expects.
                pl.lit(None).alias("option_expire"),

                # multiplier - 1 for spot/futures
                pl.lit(1.0).alias("multiplier"),

                # pip_value
                pl.lit(1.0).alias("pip_value"),

                # file_row - MD5 hash computed from order with pre-hash fields
                pl.col("_file_row_hash").alias("file_row"),

                # row_index - original row order for position calculation
                pl.col("_row_index").alias("row_index"),
            ])
            # ============================================================
            # SYMBOL / SIDE / FEE TRANSFORMATIONS
            # ============================================================
            # COVER/SHORT mapping (before side validation)
            .with_columns([
                pl.col("side").str.replace("COVER", "BUY")
                              .str.replace("SHORT", "SELL")
                .alias("side")
            ])
            # Drop promotional trades whose symbol contains this Chinese text
            .filter(~pl.col("symbol").str.contains("币安人生"))
            # Round commissions whose string rendering uses a negative
            # exponent (scientific notation) to 8 decimal places
            .with_columns([
                pl.when(pl.col("commission").cast(pl.Utf8).str.contains("e-|E-"))
                  .then(pl.col("commission").round(8))
                  .otherwise(pl.col("commission"))
                  .alias("commission")
            ])
            # ============================================================
            # CRITICAL DATA INTEGRITY VALIDATIONS - Filter invalid records
            # ============================================================
            .filter(pl.col("status") == "FILLED")  # 1. Only FILLED orders
            .filter(pl.col("quantity") > 0)  # 2. Quantity must be positive
            .filter(pl.col("price") > 0)  # 3. Price must be positive
            .filter(pl.col("symbol") != "")  # 4. Symbol cannot be empty
            .filter(pl.col("symbol").is_not_null())  # 4. Symbol cannot be null
            .filter(pl.col("timestamp").is_not_null())  # 5. Timestamp must be valid
            .filter(pl.col("side").is_in(["BUY", "SELL"]))  # 9. Side must be BUY or SELL
            # Select final columns in correct order (19 columns per schema + row_index)
            .select([
                "user_id",
                "account_id",
                "execution_id",
                "symbol",
                "side",
                "quantity",
                "price",
                "timestamp",
                "commission",
                "fees",
                "swap",
                "currency",
                "asset",
                "option_strike",
                "option_expire",
                "multiplier",
                "pip_value",
                "original_file_row",
                "file_row",
                "row_index",
            ])
        )
