# SUimeModelTraner/analyze_frequency.py

#!/usr/bin/env python3
"""
Analyze frequency distribution in pinyin_char_statistics.json
"""

import json
import sys
import os
import math
from collections import defaultdict
from pathlib import Path

def _percentile(sorted_values, fraction):
    """Return the element at position ``int(fraction * n)`` of an ascending list.

    The index is clamped to the last element so ``fraction == 1.0`` is safe.
    ``sorted_values`` must be non-empty.
    """
    last = len(sorted_values) - 1
    return sorted_values[min(int(fraction * len(sorted_values)), last)]


def _collect_counts(pairs):
    """Extract frequency data from the ``pairs`` mapping of the statistics file.

    Each value is expected to be a dict with ``id`` and ``count`` keys.
    Entries missing either key are skipped silently; entries that are
    malformed (not a dict, non-numeric or non-finite count, unhashable id)
    are skipped with a warning.

    Returns:
        A ``(counts, id_to_count)`` tuple: the list of every accepted count
        and a dict mapping each id to its (last seen) count.
    """
    counts = []
    id_to_count = {}
    for key, pair in pairs.items():
        if not isinstance(pair, dict):
            print(f"Warning: Could not parse pair {key}: "
                  f"expected an object, got {type(pair).__name__}")
            continue

        char_id = pair.get('id')
        count = pair.get('count')
        if char_id is None or count is None:
            continue

        # bool is a subclass of int; a true/false count is a data error.
        if isinstance(count, bool) or not isinstance(count, (int, float)):
            print(f"Warning: Could not parse pair {key}: non-numeric count {count!r}")
            continue
        # json.load accepts NaN/Infinity, which would poison min/max/sort.
        if isinstance(count, float) and not math.isfinite(count):
            print(f"Warning: Could not parse pair {key}: non-finite count {count!r}")
            continue

        try:
            id_to_count[char_id] = count
        except TypeError as e:
            # Unhashable id (e.g. a JSON array).
            print(f"Warning: Could not parse pair {key}: {e}")
            continue
        # Append only after the id was accepted so both containers agree.
        counts.append(count)
    return counts, id_to_count


def _print_log_histogram(counts, num_bins=20):
    """Print a histogram of the positive counts using log10-spaced bins.

    Zero and negative counts are ignored. The bins span the smallest to the
    largest positive count, so every bin index is non-negative. When all
    positive counts are equal, a single line is printed instead of dividing
    by a zero-width range.
    """
    positive = [c for c in counts if c > 0]
    if not positive:
        return

    log_min = math.log10(min(positive))
    log_max = math.log10(max(positive))
    span = log_max - log_min

    print("Log-scale histogram (count range -> frequency count):")
    if span == 0:
        edge = 10 ** log_min
        print(f"  {edge:.2e} - {edge:.2e}: {len(positive)} entries")
        return

    bin_edges = [10 ** (log_min + i * span / num_bins) for i in range(num_bins + 1)]
    hist = [0] * num_bins
    for count in positive:
        # The maximum value lands exactly on the upper edge; clamp it into
        # the last bin.
        bin_idx = min(int((math.log10(count) - log_min) / span * num_bins), num_bins - 1)
        hist[bin_idx] += 1

    for i, entries in enumerate(hist):
        if entries > 0:
            print(f"  {bin_edges[i]:.2e} - {bin_edges[i + 1]:.2e}: {entries} entries")


def main():
    """Print a frequency-distribution report for pinyin_char_statistics.json.

    Reads ``src/model/assets/pinyin_char_statistics.json`` relative to the
    current working directory, prints basic statistics, percentiles, an
    ID-versus-frequency ordering analysis, statistics for IDs 5000-5500, a
    log-binned histogram and a low-frequency summary, then writes a short
    summary to ``frequency_analysis_results.txt`` in the current directory.

    Exits with status 1 when the input file is missing, unreadable, or not
    shaped as a JSON object with an object-valued ``pairs`` entry. Returns
    early, without writing the summary file, when no usable counts exist.
    """
    # Path to the JSON file
    json_path = Path("src/model/assets/pinyin_char_statistics.json")
    if not json_path.exists():
        print(f"Error: File not found: {json_path}")
        sys.exit(1)

    print(f"Loading {json_path}...")
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error: Could not read {json_path}: {e}")
        sys.exit(1)

    if not isinstance(data, dict):
        print(f"Error: Expected a JSON object at the top level of {json_path}")
        sys.exit(1)

    print(f"Timestamp: {data.get('timestamp')}")
    print(f"Total characters: {data.get('total_characters')}")
    print(f"Total pinyins: {data.get('total_pinyins')}")
    print(f"Valid input character count: {data.get('valid_input_character_count')}")

    # A missing or null 'pairs' entry is treated as empty.
    pairs = data.get('pairs') or {}
    if not isinstance(pairs, dict):
        print(f"Error: 'pairs' must be a JSON object, got {type(pairs).__name__}")
        sys.exit(1)
    print(f"Number of pairs: {len(pairs)}")

    # Extract counts and IDs
    counts, id_to_count = _collect_counts(pairs)

    if not counts:
        print("No valid count data found.")
        return

    # Basic statistics
    min_count = min(counts)
    max_count = max(counts)
    total_count = sum(counts)
    n = len(counts)
    mean_count = total_count / n

    # Percentiles
    sorted_counts = sorted(counts)
    p10 = _percentile(sorted_counts, 0.1)
    p25 = _percentile(sorted_counts, 0.25)
    p50 = _percentile(sorted_counts, 0.5)
    p75 = _percentile(sorted_counts, 0.75)
    p90 = _percentile(sorted_counts, 0.9)
    p99 = _percentile(sorted_counts, 0.99)

    # Population variance and standard deviation
    variance = sum((x - mean_count) ** 2 for x in counts) / n
    std_dev = math.sqrt(variance)

    print("\n=== BASIC STATISTICS ===")
    print(f"Min frequency: {min_count}")
    print(f"Max frequency: {max_count}")
    print(f"Mean frequency: {mean_count:.2f}")
    print(f"Standard deviation: {std_dev:.2f}")
    print(f"Total frequency sum: {total_count}")
    print(f"Number of entries: {n}")

    print("\n=== PERCENTILES ===")
    print(f"10th percentile: {p10}")
    print(f"25th percentile: {p25}")
    print(f"50th percentile (median): {p50}")
    print(f"75th percentile: {p75}")
    print(f"90th percentile: {p90}")
    print(f"99th percentile: {p99}")

    # Find IDs with min and max counts
    min_ids = [cid for cid, count in id_to_count.items() if count == min_count]
    max_ids = [cid for cid, count in id_to_count.items() if count == max_count]

    print(f"\nIDs with min frequency ({min_count}): {min_ids}")
    print(f"IDs with max frequency ({max_count}): {max_ids}")

    # Check if IDs are assigned in frequency order: walk the counts in
    # ascending ID order and tally how often the count drops or rises.
    ids = list(id_to_count)
    sorted_by_id = sorted(ids)
    counts_by_id = [id_to_count[cid] for cid in sorted_by_id]

    decreases = 0
    increases = 0
    for prev, curr in zip(counts_by_id, counts_by_id[1:]):
        if curr < prev:
            decreases += 1
        elif curr > prev:
            increases += 1

    # With a single entry there are no adjacent pairs to compare.
    comparisons = len(counts_by_id) - 1
    pct_decreasing = decreases / comparisons * 100 if comparisons else 0.0

    print(f"\n=== ID ORDER ANALYSIS ===")
    print(f"Total pairs: {len(counts_by_id)}")
    print(f"Decreases as ID increases: {decreases} times")
    print(f"Increases as ID increases: {increases} times")
    print(f"Percentage decreasing: {pct_decreasing:.2f}%")

    # Compare each ID's position in ID order with its position in
    # descending-frequency order (mean/max absolute rank difference; this is
    # a rough sortedness measure, not a Spearman coefficient).
    sorted_by_count = sorted(ids, key=lambda cid: id_to_count[cid], reverse=True)
    rank_by_id = {cid: i for i, cid in enumerate(sorted_by_id)}
    rank_by_count = {cid: i for i, cid in enumerate(sorted_by_count)}

    rank_diffs = [abs(rank_by_id[cid] - rank_by_count[cid]) for cid in ids]
    avg_rank_diff = sum(rank_diffs) / len(rank_diffs)
    max_rank_diff = max(rank_diffs)

    print(f"Average rank difference between ID order and frequency order: {avg_rank_diff:.2f}")
    print(f"Maximum rank difference: {max_rank_diff}")

    # Analyze specific ID range 5000-5500 (inclusive)
    print("\n=== ANALYSIS OF ID RANGE 5000-5500 ===")
    range_ids = [cid for cid in range(5000, 5501) if cid in id_to_count]
    range_counts = [id_to_count[cid] for cid in range_ids]
    # Stay None when the range is empty so the summary file can say "N/A".
    range_min = None
    range_max = None

    if range_counts:
        range_min = min(range_counts)
        range_max = max(range_counts)
        range_mean = sum(range_counts) / len(range_counts)
        range_sorted = sorted(range_counts)
        range_p10 = _percentile(range_sorted, 0.1)
        range_p50 = _percentile(range_sorted, 0.5)
        range_p90 = _percentile(range_sorted, 0.9)

        print(f"IDs in range 5000-5500: {len(range_counts)}")
        print(f"Min frequency in range: {range_min}")
        print(f"Max frequency in range: {range_max}")
        print(f"Mean frequency in range: {range_mean:.2f}")
        print(f"10th percentile in range: {range_p10}")
        print(f"50th percentile in range: {range_p50}")
        print(f"90th percentile in range: {range_p90}")

        # Find IDs with min frequency in this range
        min_in_range_ids = [cid for cid in range_ids if id_to_count[cid] == range_min]
        print(f"IDs with min frequency in range: {min_in_range_ids[:10]}{'...' if len(min_in_range_ids) > 10 else ''}")
    else:
        print("No IDs found in range 5000-5500")

    # Histogram of frequencies (log bins)
    print("\n=== FREQUENCY DISTRIBUTION (LOG BINS) ===")
    _print_log_histogram(counts)

    # Check for zero or near-zero frequencies
    zero_count = sum(1 for c in counts if c == 0)
    low_count = sum(1 for c in counts if 0 < c <= 10)
    very_low_count = sum(1 for c in counts if 0 < c <= 100)

    print(f"\n=== LOW FREQUENCY ANALYSIS ===")
    print(f"Entries with zero frequency: {zero_count}")
    print(f"Entries with frequency <= 10: {low_count}")
    print(f"Entries with frequency <= 100: {very_low_count}")

    # Find the actual min frequency (excluding zeros if any)
    non_zero_counts = [c for c in counts if c > 0]
    if non_zero_counts:
        actual_min = min(non_zero_counts)
        print(f"Actual min frequency (non-zero): {actual_min}")
        actual_min_ids = [cid for cid, count in id_to_count.items() if count == actual_min]
        print(f"IDs with actual min frequency: {actual_min_ids[:10]}{'...' if len(actual_min_ids) > 10 else ''}")

    # Summary for smoothing algorithm design
    print("\n=== SUMMARY FOR SMOOTHING ALGORITHM DESIGN ===")
    print(f"Frequency range spans {max_count/min_count if min_count>0 else 'inf'}:1 ratio")
    print(f"Median entry has frequency around {p50}")
    print(f"Top 10% of entries have frequency > {p90}")
    print(f"Bottom 10% of entries have frequency < {p10}")
    print(f"ID order is {'roughly' if decreases > increases else 'not'} sorted by frequency")

    # Save the headline numbers for further analysis
    output_file = "frequency_analysis_results.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Frequency Analysis Results\n")
        f.write("="*50 + "\n")
        f.write(f"Min frequency: {min_count}\n")
        f.write(f"Max frequency: {max_count}\n")
        f.write(f"Mean frequency: {mean_count:.2f}\n")
        f.write(f"Standard deviation: {std_dev:.2f}\n")
        f.write(f"10th percentile: {p10}\n")
        f.write(f"50th percentile: {p50}\n")
        f.write(f"90th percentile: {p90}\n")
        f.write(f"IDs in range 5000-5500 min: {range_min if range_min is not None else 'N/A'}\n")
        f.write(f"IDs in range 5000-5500 max: {range_max if range_max is not None else 'N/A'}\n")

    print(f"\nDetailed results saved to {output_file}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()