#!/usr/bin/env python3 """ Comprehensive frequency distribution analysis """ import json import sys import math from collections import Counter from pathlib import Path def main(): json_path = Path("src/model/assets/pinyin_char_statistics.json") with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) pairs = data.get('pairs', {}) # Extract counts counts = [] for key, pair in pairs.items(): count = pair.get('count') if count is not None: counts.append(count) n = len(counts) print(f"Total entries: {n}") # Sort descending for rank-frequency analysis counts_sorted_desc = sorted(counts, reverse=True) # Basic statistics min_count = min(counts) max_count = max(counts) mean_count = sum(counts) / n # Percentiles percentiles = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99] print("\n=== PERCENTILE DISTRIBUTION ===") for p in percentiles: idx = int(p * n) value = counts_sorted_desc[idx] print(f"{p*100:5.1f}%: {value:>12} (rank ~{idx})") # Cumulative distribution print("\n=== CUMULATIVE DISTRIBUTION ===") thresholds = [1, 2, 3, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 5000000, 10000000, 50000000, 100000000, 500000000] for thresh in thresholds: if thresh > max_count: break below = sum(1 for c in counts if c <= thresh) above = sum(1 for c in counts if c >= thresh) print(f"Count <= {thresh:10}: {below:6} entries ({below/n*100:5.1f}%)") # print(f"Count >= {thresh:10}: {above:6} entries ({above/n*100:5.1f}%)") # Check min_count=109 parameter print("\n=== ANALYSIS OF THRESHOLD 109 ===") below_109 = sum(1 for c in counts if c < 109) at_or_above_109 = sum(1 for c in counts if c >= 109) print(f"Entries with count < 109: {below_109} ({below_109/n*100:.1f}%)") print(f"Entries with count >= 109: {at_or_above_109} ({at_or_above_109/n*100:.1f}%)") # If 109 is a threshold, what's the actual min among those >= 109? counts_ge_109 = [c for c in counts if c >= 109] if counts_ge_109: actual_min_ge_109 = min(counts_ge_109) print(f"Actual min frequency among those >= 109: {actual_min_ge_109}") # Rank-frequency analysis (Zipf's law) print("\n=== RANK-FREQUENCY ANALYSIS (Top 100) ===") print("Rank\tFrequency\tlog(rank)\tlog(freq)") for rank in range(1, 101): freq = counts_sorted_desc[rank-1] print(f"{rank}\t{freq}\t{math.log(rank):.3f}\t{math.log(freq):.3f}") # Frequency spectrum (how many distinct frequencies) freq_counter = Counter(counts) print(f"\n=== FREQUENCY SPECTRUM ===") print(f"Distinct frequency values: {len(freq_counter)}") # Most common frequencies print("\nTop 20 most common frequencies (plateau sizes):") for freq, freq_count in freq_counter.most_common(20): print(f" Frequency {freq}: {freq_count} entries") # Analyze ID ranges print("\n=== ID RANGE ANALYSIS ===") # Build ID to count mapping id_to_count = {} for key, pair in pairs.items(): char_id = pair.get('id') count = pair.get('count') if char_id is not None and count is not None: id_to_count[char_id] = count ranges = [ (0, 100, "Top 100 IDs"), (100, 500, "IDs 100-500"), (500, 1000, "IDs 500-1000"), (1000, 2000, "IDs 1000-2000"), (2000, 5000, "IDs 2000-5000"), (5000, 5500, "IDs 5000-5500 (user mentioned)"), (5500, 6000, "IDs 5500-6000"), (10000, 10500, "IDs 10000-10500"), (15000, 15500, "IDs 15000-15500"), (19000, 19500, "IDs 19000-19500 (before freq=1)"), (19499, 20647, "IDs with freq=1"), ] for start, end, label in ranges: range_counts = [id_to_count[id] for id in range(start, end) if id in id_to_count] if range_counts: min_c = min(range_counts) max_c = max(range_counts) mean_c = sum(range_counts) / len(range_counts) median_c = sorted(range_counts)[len(range_counts)//2] print(f"{label} ({len(range_counts)} entries): min={min_c}, max={max_c}, mean={mean_c:.1f}, median={median_c}") # Check if IDs are perfectly sorted by frequency print("\n=== ID ORDER VERIFICATION ===") all_ids = sorted(id_to_count.keys()) all_counts = [id_to_count[id] for id in all_ids] # Check for any violations of non-increasing order violations = 0 for i in range(1, len(all_counts)): if all_counts[i] > all_counts[i-1]: violations += 1 if violations <= 5: print(f"Violation at ID {all_ids[i]}: {all_counts[i]} > {all_counts[i-1]} (ID {all_ids[i-1]})") print(f"Total violations of non-increasing order: {violations}") # Check if equal frequencies are grouped together print("\n=== FREQUENCY GROUPING ANALYSIS ===") current_freq = None group_start = None group_sizes = [] for i, (id, count) in enumerate(zip(all_ids, all_counts)): if count != current_freq: if current_freq is not None: group_sizes.append((current_freq, group_start, all_ids[i-1], i - group_start)) current_freq = count group_start = i # Last group if current_freq is not None: group_sizes.append((current_freq, group_start, all_ids[-1], len(all_ids) - group_start)) # Sort groups by size group_sizes.sort(key=lambda x: x[3], reverse=True) print("Top 10 largest frequency groups (plateaus):") for freq, start_id_idx, end_id, size in group_sizes[:10]: start_id = all_ids[start_id_idx] print(f" Frequency {freq}: IDs {start_id}-{end_id} ({size} entries)") # Summary for smoothing algorithm print("\n=== SMOOTHING ALGORITHM IMPLICATIONS ===") print("1. IDs are perfectly sorted by frequency (non-increasing).") print(f"2. Frequency range: {min_count} to {max_count} (ratio {max_count/min_count:.1e}:1).") print(f"3. {below_109} entries ({below_109/n*100:.1f}%) have frequency < 109.") print(f"4. Median frequency: {counts_sorted_desc[n//2]}.") print(f"5. 90% of entries have frequency <= {counts_sorted_desc[int(0.9*n)]}.") print(f"6. Top 1% of entries have frequency >= {counts_sorted_desc[int(0.01*n)]}.") print("7. Large frequency plateaus exist (many IDs share same frequency).") print("8. Smoothing should handle extreme frequency ratios (1:5e8).") # Save data for plotting with open("rank_freq.csv", "w") as f: f.write("rank,frequency\n") for rank, freq in enumerate(counts_sorted_desc, 1): f.write(f"{rank},{freq}\n") print("\nRank-frequency data saved to rank_freq.csv") if __name__ == "__main__": main()