SUimeModelTraner/comprehensive_analysis.py

178 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
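Comprehensive frequency distribution analysis.

Loads the ``pairs`` table from ``src/model/assets/pinyin_char_statistics.json``,
prints count-distribution reports to stdout, and writes the rank-frequency
series to ``rank_freq.csv``.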
"""
import json
import sys
import math
from collections import Counter
from pathlib import Path
def _print_rank_percentiles(counts_sorted_desc):
    """Print the count found at fixed fractions of the descending rank list.

    The percentages are fractions of the way down the list sorted from the
    highest count to the lowest, so "1.0%" is the count at the top-1% rank.
    """
    n = len(counts_sorted_desc)
    print("\n=== PERCENTILE DISTRIBUTION ===")
    for p in (0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99):
        # p < 1, so int(p * n) is always a valid index into a non-empty list.
        idx = int(p * n)
        value = counts_sorted_desc[idx]
        print(f"{p*100:5.1f}%: {value:>12} (rank ~{idx})")


def _print_cumulative(counts, max_count):
    """Print how many entries have a count at or below each fixed threshold."""
    n = len(counts)
    print("\n=== CUMULATIVE DISTRIBUTION ===")
    thresholds = [
        1, 2, 3, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000,
        20000, 50000, 100000, 200000, 500000, 1000000, 5000000, 10000000,
        50000000, 100000000, 500000000,
    ]
    for thresh in thresholds:
        # Thresholds are ascending; once one exceeds the maximum, every
        # remaining row would report 100%, so stop.
        if thresh > max_count:
            break
        below = sum(1 for c in counts if c <= thresh)
        print(f"Count <= {thresh:10}: {below:6} entries ({below/n*100:5.1f}%)")


def _print_threshold_analysis(counts, threshold):
    """Print the split of entries around *threshold*; return the number below it."""
    n = len(counts)
    print(f"\n=== ANALYSIS OF THRESHOLD {threshold} ===")
    below = sum(1 for c in counts if c < threshold)
    kept = [c for c in counts if c >= threshold]
    print(f"Entries with count < {threshold}: {below} ({below/n*100:.1f}%)")
    print(f"Entries with count >= {threshold}: {len(kept)} ({len(kept)/n*100:.1f}%)")
    if kept:
        print(f"Actual min frequency among those >= {threshold}: {min(kept)}")
    return below


def _print_top_ranks(counts_sorted_desc, limit=100):
    """Print rank, frequency and their natural logs for the top *limit* ranks."""
    print(f"\n=== RANK-FREQUENCY ANALYSIS (Top {limit}) ===")
    print("Rank\tFrequency\tlog(rank)\tlog(freq)")
    # Slicing (instead of indexing ranks 1..limit) copes with short inputs.
    for rank, freq in enumerate(counts_sorted_desc[:limit], 1):
        # math.log raises ValueError for non-positive values.
        log_freq = f"{math.log(freq):.3f}" if freq > 0 else "n/a"
        print(f"{rank}\t{freq}\t{math.log(rank):.3f}\t{log_freq}")


def _print_frequency_spectrum(counts):
    """Print the number of distinct counts and the 20 most repeated ones."""
    freq_counter = Counter(counts)
    print("\n=== FREQUENCY SPECTRUM ===")
    print(f"Distinct frequency values: {len(freq_counter)}")
    print("\nTop 20 most common frequencies (plateau sizes):")
    for freq, freq_count in freq_counter.most_common(20):
        print(f" Frequency {freq}: {freq_count} entries")


def _print_id_ranges(id_to_count):
    """Print min/max/mean/median count for a fixed set of half-open ID ranges."""
    print("\n=== ID RANGE ANALYSIS ===")
    ranges = [
        (0, 100, "Top 100 IDs"),
        (100, 500, "IDs 100-500"),
        (500, 1000, "IDs 500-1000"),
        (1000, 2000, "IDs 1000-2000"),
        (2000, 5000, "IDs 2000-5000"),
        (5000, 5500, "IDs 5000-5500 (user mentioned)"),
        (5500, 6000, "IDs 5500-6000"),
        (10000, 10500, "IDs 10000-10500"),
        (15000, 15500, "IDs 15000-15500"),
        (19000, 19500, "IDs 19000-19500 (before freq=1)"),
        (19499, 20647, "IDs with freq=1"),
    ]
    for start, end, label in ranges:
        range_counts = [
            id_to_count[char_id]
            for char_id in range(start, end)
            if char_id in id_to_count
        ]
        if not range_counts:
            continue
        min_c = min(range_counts)
        max_c = max(range_counts)
        mean_c = sum(range_counts) / len(range_counts)
        # Upper median: the element at index len // 2 of the sorted values.
        median_c = sorted(range_counts)[len(range_counts) // 2]
        print(f"{label} ({len(range_counts)} entries): min={min_c}, max={max_c}, mean={mean_c:.1f}, median={median_c}")


def _print_order_violations(all_ids, all_counts):
    """Print and return how often a count rises from one ID to the next."""
    print("\n=== ID ORDER VERIFICATION ===")
    violations = 0
    for i in range(1, len(all_counts)):
        if all_counts[i] > all_counts[i - 1]:
            violations += 1
            # Only the first few violations are listed to keep output short.
            if violations <= 5:
                print(f"Violation at ID {all_ids[i]}: {all_counts[i]} > {all_counts[i-1]} (ID {all_ids[i-1]})")
    print(f"Total violations of non-increasing order: {violations}")
    return violations


def _print_plateaus(all_ids, all_counts):
    """Print the 10 longest runs of consecutive IDs sharing the same count."""
    print("\n=== FREQUENCY GROUPING ANALYSIS ===")
    groups = []  # (frequency, first_id, last_id, size)
    run_start = 0
    total = len(all_ids)
    for i in range(1, total + 1):
        # Close the current run at the end of the data or when the count changes.
        if i == total or all_counts[i] != all_counts[run_start]:
            groups.append(
                (all_counts[run_start], all_ids[run_start], all_ids[i - 1], i - run_start)
            )
            run_start = i
    # Stable sort: equally sized runs stay in ID order.
    groups.sort(key=lambda g: g[3], reverse=True)
    print("Top 10 largest frequency groups (plateaus):")
    for freq, start_id, end_id, size in groups[:10]:
        print(f" Frequency {freq}: IDs {start_id}-{end_id} ({size} entries)")


def _print_summary(counts_sorted_desc, below_threshold, threshold, violations):
    """Print the closing summary, derived from the values computed earlier."""
    n = len(counts_sorted_desc)
    max_count = counts_sorted_desc[0]
    min_count = counts_sorted_desc[-1]
    if min_count:
        ratio_text = f"{max_count / min_count:.1e}"
    else:
        # A zero minimum makes the ratio unbounded; report that instead of
        # raising ZeroDivisionError.
        ratio_text = "inf"

    print("\n=== SMOOTHING ALGORITHM IMPLICATIONS ===")
    if violations == 0:
        print("1. IDs are perfectly sorted by frequency (non-increasing).")
    else:
        print(f"1. IDs are NOT perfectly sorted by frequency ({violations} violations of non-increasing order).")
    print(f"2. Frequency range: {min_count} to {max_count} (ratio {ratio_text}:1).")
    print(f"3. {below_threshold} entries ({below_threshold/n*100:.1f}%) have frequency < {threshold}.")
    print(f"4. Median frequency: {counts_sorted_desc[n//2]}.")
    # The list is descending, so the value bounding the lowest 90% of entries
    # from above sits 10% of the way down, not 90%.
    print(f"5. 90% of entries have frequency <= {counts_sorted_desc[int(0.1*n)]}.")
    print(f"6. Top 1% of entries have frequency >= {counts_sorted_desc[int(0.01*n)]}.")
    print("7. Large frequency plateaus exist (many IDs share same frequency).")
    print(f"8. Smoothing should handle extreme frequency ratios (1:{ratio_text}).")


def main(json_path=None, csv_path="rank_freq.csv", threshold=109):
    """Analyze the count distribution stored in a statistics JSON file.

    The file is expected to hold a top-level ``pairs`` mapping whose values are
    dicts with optional ``id`` and ``count`` keys. Entries without a ``count``
    are ignored. A series of reports is printed to stdout and the descending
    rank-frequency series is written as CSV.

    Args:
        json_path: Path of the statistics file. Defaults to
            ``src/model/assets/pinyin_char_statistics.json``.
        csv_path: Destination of the ``rank,frequency`` CSV output.
        threshold: Count cut-off examined in the threshold report and summary.

    Errors from opening or parsing the input file are not caught here.
    """
    if json_path is None:
        json_path = Path("src/model/assets/pinyin_char_statistics.json")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    pairs = data.get('pairs', {})

    counts = [
        c for c in (pair.get('count') for pair in pairs.values()) if c is not None
    ]
    n = len(counts)
    print(f"Total entries: {n}")
    if n == 0:
        # Every later report divides by n or indexes the sorted list.
        print("No entries with a count found; nothing to analyze.")
        return

    # Descending order: index 0 is rank 1.
    counts_sorted_desc = sorted(counts, reverse=True)
    max_count = counts_sorted_desc[0]

    _print_rank_percentiles(counts_sorted_desc)
    _print_cumulative(counts, max_count)
    below_threshold = _print_threshold_analysis(counts, threshold)
    _print_top_ranks(counts_sorted_desc)
    _print_frequency_spectrum(counts)

    # Map each ID to its count; entries missing either field are skipped.
    id_to_count = {}
    for pair in pairs.values():
        char_id = pair.get('id')
        count = pair.get('count')
        if char_id is not None and count is not None:
            id_to_count[char_id] = count
    _print_id_ranges(id_to_count)

    all_ids = sorted(id_to_count)
    all_counts = [id_to_count[char_id] for char_id in all_ids]
    violations = _print_order_violations(all_ids, all_counts)
    _print_plateaus(all_ids, all_counts)

    _print_summary(counts_sorted_desc, below_threshold, threshold, violations)

    # Save data for plotting.
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("rank,frequency\n")
        f.writelines(
            f"{rank},{freq}\n" for rank, freq in enumerate(counts_sorted_desc, 1)
        )
    print(f"\nRank-frequency data saved to {csv_path}")
# Entry point: run the analysis only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()