SUimeModelTraner/scripts/comprehensive_analysis.py

228 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
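Comprehensive frequency distribution analysis of pinyin_char_statistics.json.

Prints percentile, cumulative, rank-frequency (Zipf), ID-range and plateau
statistics for the per-pair counts, and writes the rank-frequency table to
rank_freq.csv.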
"""
import json
import sys
import math
from collections import Counter
from pathlib import Path
# Count thresholds reported in the cumulative-distribution section (ascending,
# so the loop can stop at the first one above the observed maximum).
_CUMULATIVE_THRESHOLDS = (
    1,
    2,
    3,
    5,
    10,
    20,
    50,
    100,
    200,
    500,
    1000,
    2000,
    5000,
    10000,
    20000,
    50000,
    100000,
    200000,
    500000,
    1000000,
    5000000,
    10000000,
    50000000,
    100000000,
    500000000,
)

# Candidate minimum-count cut-off that gets its own report section.
_MIN_COUNT_THRESHOLD = 109

# Half-open ID ranges [start, end) summarised in the ID-range section.
_ID_RANGES = (
    (0, 100, "Top 100 IDs"),
    (100, 500, "IDs 100-500"),
    (500, 1000, "IDs 500-1000"),
    (1000, 2000, "IDs 1000-2000"),
    (2000, 5000, "IDs 2000-5000"),
    (5000, 5500, "IDs 5000-5500 (user mentioned)"),
    (5500, 6000, "IDs 5500-6000"),
    (10000, 10500, "IDs 10000-10500"),
    (15000, 15500, "IDs 15000-15500"),
    (19000, 19500, "IDs 19000-19500 (before freq=1)"),
    (19499, 20647, "IDs with freq=1"),
)


def _log_or_nan(value):
    """Return ln(value), or NaN when value is not positive (log is undefined)."""
    return math.log(value) if value > 0 else float("nan")


def _frequency_plateaus(ids, counts):
    """Group consecutive equal counts into (freq, first_id, last_id, size) runs.

    ``ids`` and ``counts`` are parallel sequences in ID order.
    """
    plateaus = []
    start = 0
    for idx, count in enumerate(counts):
        if count != counts[start]:
            plateaus.append((counts[start], ids[start], ids[idx - 1], idx - start))
            start = idx
    if counts:
        # Close the run that is still open when the data ends.
        plateaus.append((counts[start], ids[start], ids[-1], len(counts) - start))
    return plateaus


def main(json_path=None, csv_path="rank_freq.csv"):
    """Print a frequency-distribution report and save the rank-frequency table.

    Args:
        json_path: Statistics file to read. It must hold a top-level ``"pairs"``
            mapping whose values carry ``"count"`` and ``"id"`` fields.
            Defaults to ``src/model/assets/pinyin_char_statistics.json``
            relative to the parent of this script's directory.
        csv_path: Destination of the ``rank,frequency`` CSV. Defaults to
            ``rank_freq.csv`` in the current working directory.

    Returns:
        None. The report goes to stdout. When no entry has a count, only the
        total is printed and no CSV is written.
    """
    if json_path is None:
        json_path = (
            Path(__file__).parent.parent
            / "src"
            / "model"
            / "assets"
            / "pinyin_char_statistics.json"
        )
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    pairs = data.get("pairs", {})

    # Extract counts, skipping entries that have none.
    raw_counts = (pair.get("count") for pair in pairs.values())
    counts = [count for count in raw_counts if count is not None]
    n = len(counts)
    print(f"Total entries: {n}")
    if not counts:
        # min()/max() and every percentage below are undefined without data.
        print("No entries with a count were found; nothing to analyze.")
        return

    # Sort descending for rank-frequency analysis.
    counts_sorted_desc = sorted(counts, reverse=True)
    min_count = counts_sorted_desc[-1]
    max_count = counts_sorted_desc[0]

    # The list is descending, so the row labelled p% shows the count found at
    # the top-p% rank (i.e. roughly p% of entries are at least that frequent).
    print("\n=== PERCENTILE DISTRIBUTION ===")
    for p in (0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99):
        idx = int(p * n)
        value = counts_sorted_desc[idx]
        print(f"{p * 100:5.1f}%: {value:>12} (rank ~{idx})")

    # Cumulative distribution
    print("\n=== CUMULATIVE DISTRIBUTION ===")
    for thresh in _CUMULATIVE_THRESHOLDS:
        if thresh > max_count:
            break
        below = sum(1 for c in counts if c <= thresh)
        print(f"Count <= {thresh:10}: {below:6} entries ({below / n * 100:5.1f}%)")

    # Check the candidate min_count parameter.
    print(f"\n=== ANALYSIS OF THRESHOLD {_MIN_COUNT_THRESHOLD} ===")
    counts_ge_threshold = [c for c in counts if c >= _MIN_COUNT_THRESHOLD]
    below_109 = sum(1 for c in counts if c < _MIN_COUNT_THRESHOLD)
    at_or_above_109 = len(counts_ge_threshold)
    print(
        f"Entries with count < {_MIN_COUNT_THRESHOLD}: "
        f"{below_109} ({below_109 / n * 100:.1f}%)"
    )
    print(
        f"Entries with count >= {_MIN_COUNT_THRESHOLD}: "
        f"{at_or_above_109} ({at_or_above_109 / n * 100:.1f}%)"
    )
    # If the threshold were applied, what is the smallest surviving count?
    if counts_ge_threshold:
        print(
            f"Actual min frequency among those >= {_MIN_COUNT_THRESHOLD}: "
            f"{min(counts_ge_threshold)}"
        )

    # Rank-frequency analysis (Zipf's law); slicing copes with < 100 entries.
    print("\n=== RANK-FREQUENCY ANALYSIS (Top 100) ===")
    print("Rank\tFrequency\tlog(rank)\tlog(freq)")
    for rank, freq in enumerate(counts_sorted_desc[:100], 1):
        print(f"{rank}\t{freq}\t{math.log(rank):.3f}\t{_log_or_nan(freq):.3f}")

    # Frequency spectrum (how many distinct frequencies)
    freq_counter = Counter(counts)
    print("\n=== FREQUENCY SPECTRUM ===")
    print(f"Distinct frequency values: {len(freq_counter)}")
    print("\nTop 20 most common frequencies (plateau sizes):")
    for freq, freq_count in freq_counter.most_common(20):
        print(f"  Frequency {freq}: {freq_count} entries")

    # Analyze ID ranges
    print("\n=== ID RANGE ANALYSIS ===")
    id_to_count = {}
    for pair in pairs.values():
        char_id = pair.get("id")
        count = pair.get("count")
        if char_id is not None and count is not None:
            id_to_count[char_id] = count
    for start, end, label in _ID_RANGES:
        range_counts = [
            id_to_count[char_id]
            for char_id in range(start, end)
            if char_id in id_to_count
        ]
        if not range_counts:
            continue
        min_c = min(range_counts)
        max_c = max(range_counts)
        mean_c = sum(range_counts) / len(range_counts)
        median_c = sorted(range_counts)[len(range_counts) // 2]
        print(
            f"{label} ({len(range_counts)} entries): min={min_c}, max={max_c}, mean={mean_c:.1f}, median={median_c}"
        )

    # Check if IDs are perfectly sorted by frequency
    print("\n=== ID ORDER VERIFICATION ===")
    all_ids = sorted(id_to_count)
    all_counts = [id_to_count[char_id] for char_id in all_ids]
    by_id = list(zip(all_ids, all_counts))
    violations = 0
    for (prev_id, prev_count), (cur_id, cur_count) in zip(by_id, by_id[1:]):
        if cur_count > prev_count:
            violations += 1
            # Only the first few violations are listed; the total follows.
            if violations <= 5:
                print(
                    f"Violation at ID {cur_id}: {cur_count} > {prev_count} (ID {prev_id})"
                )
    print(f"Total violations of non-increasing order: {violations}")

    # Check if equal frequencies are grouped together
    print("\n=== FREQUENCY GROUPING ANALYSIS ===")
    plateaus = _frequency_plateaus(all_ids, all_counts)
    plateaus.sort(key=lambda plateau: plateau[3], reverse=True)
    print("Top 10 largest frequency groups (plateaus):")
    for freq, start_id, end_id, size in plateaus[:10]:
        print(f"  Frequency {freq}: IDs {start_id}-{end_id} ({size} entries)")

    # Summary for smoothing algorithm
    print("\n=== SMOOTHING ALGORITHM IMPLICATIONS ===")
    if violations == 0:
        print("1. IDs are perfectly sorted by frequency (non-increasing).")
    else:
        # Report what the verification above actually found.
        print(
            f"1. IDs are NOT perfectly sorted by frequency "
            f"({violations} violation(s) of non-increasing order)."
        )
    ratio_text = f"{max_count / min_count:.1e}" if min_count else "inf"
    print(f"2. Frequency range: {min_count} to {max_count} (ratio {ratio_text}:1).")
    print(
        f"3. {below_109} entries ({below_109 / n * 100:.1f}%) "
        f"have frequency < {_MIN_COUNT_THRESHOLD}."
    )
    print(f"4. Median frequency: {counts_sorted_desc[n // 2]}.")
    # In a descending list the value 10% of the way down is the one that about
    # 90% of entries do not exceed.
    print(f"5. 90% of entries have frequency <= {counts_sorted_desc[n // 10]}.")
    print(
        f"6. Top 1% of entries have frequency >= {counts_sorted_desc[int(0.01 * n)]}."
    )
    print("7. Large frequency plateaus exist (many IDs share same frequency).")
    print("8. Smoothing should handle extreme frequency ratios (1:5e8).")

    # Save data for plotting
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("rank,frequency\n")
        f.writelines(
            f"{rank},{freq}\n" for rank, freq in enumerate(counts_sorted_desc, 1)
        )
    print(f"\nRank-frequency data saved to {csv_path}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()