# SUimeModelTraner/comprehensive_analysis.py

#!/usr/bin/env python3
"""
Comprehensive frequency distribution analysis
"""

import json
import sys
import math
from collections import Counter
from pathlib import Path

def _safe_log(value):
    """Return the natural log of *value* without raising on non-positive input.

    Zero maps to ``-inf`` and negative values to ``nan`` so that a single
    degenerate count cannot abort the rank-frequency table.
    """
    if value > 0:
        return math.log(value)
    return float("-inf") if value == 0 else float("nan")


def _frequency_plateaus(ids, counts):
    """Group consecutive equal counts into plateaus.

    Args:
        ids: IDs in ascending order.
        counts: Counts aligned with *ids* (same length, same order).

    Returns:
        A list of ``(frequency, first_id, last_id, size)`` tuples, in the
        order the plateaus appear.
    """
    plateaus = []
    start = 0
    total = len(counts)
    for pos in range(1, total + 1):
        # Close the current run at the end of the data or when the count changes.
        if pos == total or counts[pos] != counts[start]:
            plateaus.append((counts[start], ids[start], ids[pos - 1], pos - start))
            start = pos
    return plateaus


def main(json_path="src/model/assets/pinyin_char_statistics.json",
         csv_path="rank_freq.csv", threshold=109):
    """Print a frequency-distribution report and export rank-frequency data.

    Reads a JSON document whose ``pairs`` mapping holds entries with a
    ``count`` and (optionally) an ``id``, prints several distribution
    analyses to stdout, and writes ``rank,frequency`` rows to a CSV file.

    Args:
        json_path: Path of the statistics JSON file to read.
        csv_path: Path of the CSV file to write.
        threshold: Count threshold examined in the dedicated threshold section
            and in the summary.

    Raises:
        OSError: Propagated from opening the input or output file.
        json.JSONDecodeError: Propagated when the input is not valid JSON.
    """
    json_path = Path(json_path)
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    pairs = data.get('pairs', {})

    # Entries without a count are ignored everywhere below.
    counts = [pair.get('count') for pair in pairs.values()
              if pair.get('count') is not None]

    n = len(counts)
    print(f"Total entries: {n}")
    if n == 0:
        # Every statistic below needs at least one value (min/max, divisions by n).
        print("No count data found; nothing to analyze.")
        return

    # Descending order gives rank-frequency order; ascending is used for
    # "at most" style percentiles.
    counts_sorted_desc = sorted(counts, reverse=True)
    counts_sorted_asc = counts_sorted_desc[::-1]

    min_count = counts_sorted_asc[0]
    max_count = counts_sorted_desc[0]

    # Rank-based percentiles: the value found p% of the way down the
    # descending list (so 1% is near the top of the ranking).
    percentiles = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
    print("\n=== PERCENTILE DISTRIBUTION ===")
    for p in percentiles:
        idx = int(p * n)
        value = counts_sorted_desc[idx]
        print(f"{p*100:5.1f}%: {value:>12} (rank ~{idx})")

    # Cumulative distribution
    print("\n=== CUMULATIVE DISTRIBUTION ===")
    thresholds = [1, 2, 3, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
                  10000, 20000, 50000, 100000, 200000, 500000, 1000000,
                  5000000, 10000000, 50000000, 100000000, 500000000]
    for thresh in thresholds:
        if thresh > max_count:
            break
        below = sum(1 for c in counts if c <= thresh)
        print(f"Count <= {thresh:10}: {below:6} entries ({below/n*100:5.1f}%)")

    # How the data splits around the configured threshold.
    print(f"\n=== ANALYSIS OF THRESHOLD {threshold} ===")
    counts_ge_threshold = [c for c in counts if c >= threshold]
    at_or_above = len(counts_ge_threshold)
    below_threshold = sum(1 for c in counts if c < threshold)
    print(f"Entries with count < {threshold}: {below_threshold} ({below_threshold/n*100:.1f}%)")
    print(f"Entries with count >= {threshold}: {at_or_above} ({at_or_above/n*100:.1f}%)")
    if counts_ge_threshold:
        print(f"Actual min frequency among those >= {threshold}: {min(counts_ge_threshold)}")

    # Rank-frequency analysis (Zipf's law); slicing keeps small datasets safe.
    print("\n=== RANK-FREQUENCY ANALYSIS (Top 100) ===")
    print("Rank\tFrequency\tlog(rank)\tlog(freq)")
    for rank, freq in enumerate(counts_sorted_desc[:100], 1):
        print(f"{rank}\t{freq}\t{math.log(rank):.3f}\t{_safe_log(freq):.3f}")

    # Frequency spectrum (how many distinct frequencies)
    freq_counter = Counter(counts)
    print("\n=== FREQUENCY SPECTRUM ===")
    print(f"Distinct frequency values: {len(freq_counter)}")

    print("\nTop 20 most common frequencies (plateau sizes):")
    for freq, freq_count in freq_counter.most_common(20):
        print(f"  Frequency {freq}: {freq_count} entries")

    # Analyze ID ranges
    print("\n=== ID RANGE ANALYSIS ===")
    id_to_count = {}
    for pair in pairs.values():
        char_id = pair.get('id')
        count = pair.get('count')
        if char_id is not None and count is not None:
            id_to_count[char_id] = count

    ranges = [
        (0, 100, "Top 100 IDs"),
        (100, 500, "IDs 100-500"),
        (500, 1000, "IDs 500-1000"),
        (1000, 2000, "IDs 1000-2000"),
        (2000, 5000, "IDs 2000-5000"),
        (5000, 5500, "IDs 5000-5500 (user mentioned)"),
        (5500, 6000, "IDs 5500-6000"),
        (10000, 10500, "IDs 10000-10500"),
        (15000, 15500, "IDs 15000-15500"),
        (19000, 19500, "IDs 19000-19500 (before freq=1)"),
        (19499, 20647, "IDs with freq=1"),
    ]

    for start, end, label in ranges:
        range_counts = [id_to_count[char_id] for char_id in range(start, end)
                        if char_id in id_to_count]
        if range_counts:
            min_c = min(range_counts)
            max_c = max(range_counts)
            mean_c = sum(range_counts) / len(range_counts)
            median_c = sorted(range_counts)[len(range_counts)//2]
            print(f"{label} ({len(range_counts)} entries): min={min_c}, max={max_c}, mean={mean_c:.1f}, median={median_c}")

    # Check if IDs are perfectly sorted by frequency
    print("\n=== ID ORDER VERIFICATION ===")
    all_ids = sorted(id_to_count.keys())
    all_counts = [id_to_count[char_id] for char_id in all_ids]

    # A violation is any ID whose count exceeds that of the preceding ID.
    violations = 0
    for pos, (prev, cur) in enumerate(zip(all_counts, all_counts[1:]), 1):
        if cur > prev:
            violations += 1
            if violations <= 5:  # only show the first few in detail
                print(f"Violation at ID {all_ids[pos]}: {cur} > {prev} (ID {all_ids[pos-1]})")

    print(f"Total violations of non-increasing order: {violations}")

    # Check if equal frequencies are grouped together
    print("\n=== FREQUENCY GROUPING ANALYSIS ===")
    plateaus = _frequency_plateaus(all_ids, all_counts)
    plateaus.sort(key=lambda plateau: plateau[3], reverse=True)
    print("Top 10 largest frequency groups (plateaus):")
    for freq, start_id, end_id, size in plateaus[:10]:
        print(f"  Frequency {freq}: IDs {start_id}-{end_id} ({size} entries)")

    # Summary for smoothing algorithm; every statement is derived from the
    # values computed above rather than assumed.
    print("\n=== SMOOTHING ALGORITHM IMPLICATIONS ===")
    if violations == 0:
        print("1. IDs are perfectly sorted by frequency (non-increasing).")
    else:
        print(f"1. IDs are NOT perfectly sorted by frequency ({violations} order violations).")

    if min_count > 0:
        spread = max_count / min_count
        print(f"2. Frequency range: {min_count} to {max_count} (ratio {spread:.1e}:1).")
    else:
        spread = None
        print(f"2. Frequency range: {min_count} to {max_count} (ratio undefined: minimum is not positive).")

    print(f"3. {below_threshold} entries ({below_threshold/n*100:.1f}%) have frequency < {threshold}.")
    print(f"4. Median frequency: {counts_sorted_desc[n//2]}.")

    # Nearest-rank 90th percentile: the ceil(0.9*n)-th smallest value, computed
    # with integer arithmetic to avoid float rounding.
    p90_rank = (9 * n + 9) // 10
    print(f"5. 90% of entries have frequency <= {counts_sorted_asc[p90_rank - 1]}.")
    print(f"6. Top 1% of entries have frequency >= {counts_sorted_desc[int(0.01*n)]}.")
    print("7. Large frequency plateaus exist (many IDs share same frequency).")
    if spread is not None:
        print(f"8. Smoothing should handle extreme frequency ratios (1:{spread:.0e}).")
    else:
        print("8. Smoothing should handle non-positive frequencies (ratio undefined).")

    # Save data for plotting
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("rank,frequency\n")
        f.writelines(f"{rank},{freq}\n"
                     for rank, freq in enumerate(counts_sorted_desc, 1))
    print(f"\nRank-frequency data saved to {csv_path}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()