#!/usr/bin/env python3
"""
Comprehensive frequency distribution analysis.

Reads the ``pairs`` table of a pinyin/character statistics JSON file, prints
a multi-section report about the distribution of the per-entry ``count``
values (rank-based cut-offs, cumulative distribution, a threshold check,
rank-frequency table, plateaus, ID ordering) and writes the rank/frequency
series to a CSV file for plotting.
"""

import json
import sys
import math
from collections import Counter
from itertools import groupby
from pathlib import Path


def _safe_log(value):
    """Return ``math.log(value)``; 0 maps to -inf and negatives to NaN."""
    if value > 0:
        return math.log(value)
    return float("-inf") if value == 0 else float("nan")


def _value_at_fraction(sorted_values, fraction):
    """Return ``(index, value)`` at ``fraction`` of the way through a list.

    The index is clamped to the last element so a fraction close to 1.0 can
    never run past the end of a short list.
    """
    idx = min(int(fraction * len(sorted_values)), len(sorted_values) - 1)
    return idx, sorted_values[idx]


def _frequency_plateaus(ids, counts):
    """Group consecutive equal counts into ``(freq, first_id, last_id, size)``.

    ``ids`` and ``counts`` are parallel sequences ordered by ID.
    """
    plateaus = []
    position = 0
    for freq, run in groupby(counts):
        size = sum(1 for _ in run)
        plateaus.append((freq, ids[position], ids[position + size - 1], size))
        position += size
    return plateaus


def main(json_path=None, csv_path="rank_freq.csv", threshold=109):
    """Print the frequency-distribution report and write the rank CSV.

    Args:
        json_path: Statistics JSON file to analyse. Defaults to
            ``../src/model/assets/pinyin_char_statistics.json`` relative to
            this script's directory.
        csv_path: Destination of the ``rank,frequency`` CSV.
        threshold: Count threshold examined in the "ANALYSIS OF THRESHOLD"
            section.

    Raises:
        SystemExit: If the file contains no entry with a ``count`` value.
    """
    if json_path is None:
        json_path = (
            Path(__file__).parent.parent
            / "src"
            / "model"
            / "assets"
            / "pinyin_char_statistics.json"
        )

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    pairs = data.get("pairs", {})

    # Extract counts (entries without a count are ignored everywhere below).
    counts = [
        pair["count"] for pair in pairs.values() if pair.get("count") is not None
    ]

    n = len(counts)
    print(f"Total entries: {n}")
    if n == 0:
        # Every statistic below needs at least one value; fail with a clear
        # message instead of an obscure min()/ZeroDivisionError traceback.
        sys.exit(f"No entries with a 'count' value found in {json_path}")

    # Descending order drives the rank-frequency views, ascending order the
    # "<=" statement in the summary.
    counts_sorted_desc = sorted(counts, reverse=True)
    counts_sorted_asc = counts_sorted_desc[::-1]

    # Basic statistics
    min_count = counts_sorted_asc[0]
    max_count = counts_sorted_desc[0]

    # Rank-based cut-offs: the value found p% of the way down the
    # descending ranking (so 1% is near the most frequent entries).
    percentiles = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
    print("\n=== PERCENTILE DISTRIBUTION ===")
    for p in percentiles:
        idx, value = _value_at_fraction(counts_sorted_desc, p)
        print(f"{p * 100:5.1f}%: {value:>12} (rank ~{idx})")

    # Cumulative distribution
    print("\n=== CUMULATIVE DISTRIBUTION ===")
    thresholds = [
        1,
        2,
        3,
        5,
        10,
        20,
        50,
        100,
        200,
        500,
        1000,
        2000,
        5000,
        10000,
        20000,
        50000,
        100000,
        200000,
        500000,
        1000000,
        5000000,
        10000000,
        50000000,
        100000000,
        500000000,
    ]
    for thresh in thresholds:
        if thresh > max_count:
            break
        below = sum(1 for c in counts if c <= thresh)
        print(f"Count <= {thresh:10}: {below:6} entries ({below / n * 100:5.1f}%)")

    # Check the min_count threshold parameter
    print(f"\n=== ANALYSIS OF THRESHOLD {threshold} ===")
    counts_ge_threshold = [c for c in counts if c >= threshold]
    at_or_above_threshold = len(counts_ge_threshold)
    below_threshold = n - at_or_above_threshold
    print(
        f"Entries with count < {threshold}: {below_threshold} "
        f"({below_threshold / n * 100:.1f}%)"
    )
    print(
        f"Entries with count >= {threshold}: {at_or_above_threshold} "
        f"({at_or_above_threshold / n * 100:.1f}%)"
    )

    # If this is a threshold, what's the actual min among those at/above it?
    if counts_ge_threshold:
        actual_min = min(counts_ge_threshold)
        print(f"Actual min frequency among those >= {threshold}: {actual_min}")

    # Rank-frequency analysis (Zipf's law). Slicing keeps this safe when the
    # data set has fewer than 100 entries.
    print("\n=== RANK-FREQUENCY ANALYSIS (Top 100) ===")
    print("Rank\tFrequency\tlog(rank)\tlog(freq)")
    for rank, freq in enumerate(counts_sorted_desc[:100], 1):
        print(f"{rank}\t{freq}\t{math.log(rank):.3f}\t{_safe_log(freq):.3f}")

    # Frequency spectrum (how many distinct frequencies)
    freq_counter = Counter(counts)
    print("\n=== FREQUENCY SPECTRUM ===")
    print(f"Distinct frequency values: {len(freq_counter)}")

    # Most common frequencies
    print("\nTop 20 most common frequencies (plateau sizes):")
    for freq, freq_count in freq_counter.most_common(20):
        print(f"  Frequency {freq}: {freq_count} entries")

    # Analyze ID ranges
    print("\n=== ID RANGE ANALYSIS ===")
    # Build ID to count mapping
    id_to_count = {}
    for pair in pairs.values():
        char_id = pair.get("id")
        count = pair.get("count")
        if char_id is not None and count is not None:
            id_to_count[char_id] = count

    ranges = [
        (0, 100, "Top 100 IDs"),
        (100, 500, "IDs 100-500"),
        (500, 1000, "IDs 500-1000"),
        (1000, 2000, "IDs 1000-2000"),
        (2000, 5000, "IDs 2000-5000"),
        (5000, 5500, "IDs 5000-5500 (user mentioned)"),
        (5500, 6000, "IDs 5500-6000"),
        (10000, 10500, "IDs 10000-10500"),
        (15000, 15500, "IDs 15000-15500"),
        (19000, 19500, "IDs 19000-19500 (before freq=1)"),
        (19499, 20647, "IDs with freq=1"),
    ]

    for start, end, label in ranges:
        range_counts = [
            id_to_count[char_id]
            for char_id in range(start, end)
            if char_id in id_to_count
        ]
        if range_counts:
            min_c = min(range_counts)
            max_c = max(range_counts)
            mean_c = sum(range_counts) / len(range_counts)
            # Upper median for even-sized ranges.
            median_c = sorted(range_counts)[len(range_counts) // 2]
            print(
                f"{label} ({len(range_counts)} entries): min={min_c}, "
                f"max={max_c}, mean={mean_c:.1f}, median={median_c}"
            )

    # Check if IDs are perfectly sorted by frequency
    print("\n=== ID ORDER VERIFICATION ===")
    ordered = sorted(id_to_count.items())
    all_ids = [char_id for char_id, _ in ordered]
    all_counts = [count for _, count in ordered]

    # Count violations of non-increasing order; print only the first five.
    violations = 0
    for (prev_id, prev_count), (cur_id, cur_count) in zip(ordered, ordered[1:]):
        if cur_count > prev_count:
            violations += 1
            if violations <= 5:
                print(
                    f"Violation at ID {cur_id}: {cur_count} > {prev_count} "
                    f"(ID {prev_id})"
                )

    print(f"Total violations of non-increasing order: {violations}")

    # Check if equal frequencies are grouped together
    print("\n=== FREQUENCY GROUPING ANALYSIS ===")
    plateaus = _frequency_plateaus(all_ids, all_counts)
    # Stable sort: plateaus of equal size keep their ID order.
    plateaus.sort(key=lambda plateau: plateau[3], reverse=True)
    print("Top 10 largest frequency groups (plateaus):")
    for freq, start_id, end_id, size in plateaus[:10]:
        print(f"  Frequency {freq}: IDs {start_id}-{end_id} ({size} entries)")

    # Summary for smoothing algorithm. Each statement is derived from the
    # data analysed above rather than asserted unconditionally.
    ratio = max_count / min_count if min_count > 0 else None
    largest_plateau = plateaus[0][3] if plateaus else 0
    _, median_value = _value_at_fraction(counts_sorted_desc, 0.5)
    # "<=" statements must be read from the ascending order.
    _, p90_value = _value_at_fraction(counts_sorted_asc, 0.9)
    _, top1_value = _value_at_fraction(counts_sorted_desc, 0.01)

    print("\n=== SMOOTHING ALGORITHM IMPLICATIONS ===")
    if violations == 0:
        print("1. IDs are perfectly sorted by frequency (non-increasing).")
    else:
        print(
            "1. IDs are NOT perfectly sorted by frequency "
            f"({violations} order violations)."
        )
    if ratio is not None:
        print(
            f"2. Frequency range: {min_count} to {max_count} (ratio {ratio:.1e}:1)."
        )
    else:
        print(
            f"2. Frequency range: {min_count} to {max_count} "
            "(ratio undefined: minimum is not positive)."
        )
    print(
        f"3. {below_threshold} entries ({below_threshold / n * 100:.1f}%) "
        f"have frequency < {threshold}."
    )
    print(f"4. Median frequency: {median_value}.")
    print(f"5. 90% of entries have frequency <= {p90_value}.")
    print(f"6. Top 1% of entries have frequency >= {top1_value}.")
    if largest_plateau > 1:
        print("7. Large frequency plateaus exist (many IDs share same frequency).")
    else:
        print("7. No frequency plateaus found (no adjacent IDs share a frequency).")
    if ratio is not None:
        print(
            f"8. Smoothing should handle extreme frequency ratios (1:{ratio:.0e})."
        )
    else:
        print("8. Smoothing should handle non-positive minimum frequencies.")

    # Save data for plotting
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("rank,frequency\n")
        f.writelines(
            f"{rank},{freq}\n" for rank, freq in enumerate(counts_sorted_desc, 1)
        )
    print(f"\nRank-frequency data saved to {csv_path}")


if __name__ == "__main__":
    main()