#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive frequency distribution analysis
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import math
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
|
|
def _count_plateaus(ids, counts):
    """Group consecutive IDs sharing one count into (count, first_id, last_id, size) runs."""
    plateaus = []
    run_start = 0
    for i in range(1, len(counts) + 1):
        # A run ends at the end of the data or where the count changes.
        if i == len(counts) or counts[i] != counts[run_start]:
            plateaus.append(
                (counts[run_start], ids[run_start], ids[i - 1], i - run_start)
            )
            run_start = i
    return plateaus


def _order_violations(ids, counts):
    """Return (prev_id, prev_count, cur_id, cur_count) wherever the count rises with the ID."""
    return [
        (ids[i - 1], counts[i - 1], ids[i], counts[i])
        for i in range(1, len(counts))
        if counts[i] > counts[i - 1]
    ]


def main(*, json_path=None, csv_path="rank_freq.csv", threshold=109):
    """Print a frequency-distribution report and export rank/frequency data.

    The input JSON is expected to hold a "pairs" mapping whose values are
    dicts with an "id" and a "count"; entries without a count are ignored.

    Args:
        json_path: Statistics file to analyse. Defaults to
            ``<script dir>/../src/model/assets/pinyin_char_statistics.json``.
        csv_path: Destination of the ``rank,frequency`` CSV.
        threshold: Minimum-count cut-off examined in the threshold section.

    Returns:
        None. All results go to stdout and to ``csv_path``. When no entry
        has a count, a notice is printed and no CSV is written.
    """
    if json_path is None:
        json_path = (
            Path(__file__).parent.parent
            / "src"
            / "model"
            / "assets"
            / "pinyin_char_statistics.json"
        )
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    pairs = data.get("pairs", {})

    # Extract counts
    counts = [
        c for c in (entry.get("count") for entry in pairs.values()) if c is not None
    ]

    n = len(counts)
    print(f"Total entries: {n}")
    if not counts:
        # min()/max() and every percentage below are undefined without data.
        print("No entries with a count; nothing to analyze.")
        return

    # Sort descending for rank-frequency analysis
    counts_desc = sorted(counts, reverse=True)

    # Basic statistics
    min_count = counts_desc[-1]
    max_count = counts_desc[0]
    # A zero minimum makes the max/min ratio unbounded rather than an error.
    ratio = max_count / min_count if min_count > 0 else math.inf

    # Percentiles. The list is descending, so the p-th row is the cut-off
    # of the TOP p share of entries (the printed rank makes that explicit).
    print("\n=== PERCENTILE DISTRIBUTION ===")
    for p in (0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99):
        idx = int(p * n)
        print(f"{p * 100:5.1f}%: {counts_desc[idx]:>12} (rank ~{idx})")

    # Cumulative distribution
    print("\n=== CUMULATIVE DISTRIBUTION ===")
    thresholds = [
        1, 2, 3, 5, 10, 20, 50, 100, 200, 500,
        1000, 2000, 5000, 10000, 20000, 50000,
        100000, 200000, 500000, 1000000, 5000000,
        10000000, 50000000, 100000000, 500000000,
    ]
    for thresh in thresholds:
        if thresh > max_count:
            break
        below = sum(1 for c in counts if c <= thresh)
        print(f"Count <= {thresh:10}: {below:6} entries ({below / n * 100:5.1f}%)")

    # Check the min_count threshold parameter
    print(f"\n=== ANALYSIS OF THRESHOLD {threshold} ===")
    counts_at_or_above = [c for c in counts if c >= threshold]
    below_threshold = sum(1 for c in counts if c < threshold)
    at_or_above = len(counts_at_or_above)
    print(
        f"Entries with count < {threshold}: {below_threshold} "
        f"({below_threshold / n * 100:.1f}%)"
    )
    print(
        f"Entries with count >= {threshold}: {at_or_above} "
        f"({at_or_above / n * 100:.1f}%)"
    )

    # If this is a threshold, what's the actual min among those at or above it?
    if counts_at_or_above:
        print(
            f"Actual min frequency among those >= {threshold}: "
            f"{min(counts_at_or_above)}"
        )

    # Rank-frequency analysis (Zipf's law)
    print("\n=== RANK-FREQUENCY ANALYSIS (Top 100) ===")
    print("Rank\tFrequency\tlog(rank)\tlog(freq)")
    # Slicing tolerates datasets with fewer than 100 entries.
    for rank, freq in enumerate(counts_desc[:100], 1):
        # log() is undefined for non-positive counts; report -inf instead.
        log_freq = math.log(freq) if freq > 0 else float("-inf")
        print(f"{rank}\t{freq}\t{math.log(rank):.3f}\t{log_freq:.3f}")

    # Frequency spectrum (how many distinct frequencies)
    freq_counter = Counter(counts)
    print("\n=== FREQUENCY SPECTRUM ===")
    print(f"Distinct frequency values: {len(freq_counter)}")

    # Most common frequencies
    print("\nTop 20 most common frequencies (plateau sizes):")
    for freq, freq_count in freq_counter.most_common(20):
        print(f"  Frequency {freq}: {freq_count} entries")

    # Analyze ID ranges
    print("\n=== ID RANGE ANALYSIS ===")
    # Build ID to count mapping
    id_to_count = {}
    for entry in pairs.values():
        char_id = entry.get("id")
        count = entry.get("count")
        if char_id is not None and count is not None:
            id_to_count[char_id] = count

    # NOTE(review): these boundaries look tuned to one specific dataset
    # (e.g. the "freq=1" tail) -- confirm they still match the current file.
    ranges = [
        (0, 100, "Top 100 IDs"),
        (100, 500, "IDs 100-500"),
        (500, 1000, "IDs 500-1000"),
        (1000, 2000, "IDs 1000-2000"),
        (2000, 5000, "IDs 2000-5000"),
        (5000, 5500, "IDs 5000-5500 (user mentioned)"),
        (5500, 6000, "IDs 5500-6000"),
        (10000, 10500, "IDs 10000-10500"),
        (15000, 15500, "IDs 15000-15500"),
        (19000, 19500, "IDs 19000-19500 (before freq=1)"),
        (19499, 20647, "IDs with freq=1"),
    ]

    for start, end, label in ranges:
        range_counts = [
            id_to_count[char_id]
            for char_id in range(start, end)
            if char_id in id_to_count
        ]
        if range_counts:
            min_c = min(range_counts)
            max_c = max(range_counts)
            mean_c = sum(range_counts) / len(range_counts)
            median_c = sorted(range_counts)[len(range_counts) // 2]
            print(
                f"{label} ({len(range_counts)} entries): min={min_c}, max={max_c}, mean={mean_c:.1f}, median={median_c}"
            )

    # Check if IDs are perfectly sorted by frequency
    print("\n=== ID ORDER VERIFICATION ===")
    all_ids = sorted(id_to_count)
    all_counts = [id_to_count[char_id] for char_id in all_ids]

    # Check for any violations of non-increasing order; show the first five.
    violations = _order_violations(all_ids, all_counts)
    for prev_id, prev_count, cur_id, cur_count in violations[:5]:
        print(f"Violation at ID {cur_id}: {cur_count} > {prev_count} (ID {prev_id})")

    print(f"Total violations of non-increasing order: {len(violations)}")

    # Check if equal frequencies are grouped together
    print("\n=== FREQUENCY GROUPING ANALYSIS ===")
    plateaus = _count_plateaus(all_ids, all_counts)

    # Sort groups by size
    plateaus.sort(key=lambda group: group[3], reverse=True)
    print("Top 10 largest frequency groups (plateaus):")
    for freq, start_id, end_id, size in plateaus[:10]:
        print(f"  Frequency {freq}: IDs {start_id}-{end_id} ({size} entries)")

    # Summary for smoothing algorithm
    print("\n=== SMOOTHING ALGORITHM IMPLICATIONS ===")
    if violations:
        print(
            f"1. IDs are NOT perfectly sorted by frequency "
            f"({len(violations)} violation(s) of non-increasing order)."
        )
    else:
        print("1. IDs are perfectly sorted by frequency (non-increasing).")
    print(f"2. Frequency range: {min_count} to {max_count} (ratio {ratio:.1e}:1).")
    print(
        f"3. {below_threshold} entries ({below_threshold / n * 100:.1f}%) "
        f"have frequency < {threshold}."
    )
    print(f"4. Median frequency: {counts_desc[n // 2]}.")
    # In a descending list everything from the 10% mark onward (i.e. 90% of
    # the entries) is <= the value found at that mark.
    print(f"5. 90% of entries have frequency <= {counts_desc[int(0.1 * n)]}.")
    print(f"6. Top 1% of entries have frequency >= {counts_desc[int(0.01 * n)]}.")
    print("7. Large frequency plateaus exist (many IDs share same frequency).")
    print(f"8. Smoothing should handle extreme frequency ratios (1:{ratio:.0e}).")

    # Save data for plotting
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("rank,frequency\n")
        f.writelines(
            f"{rank},{freq}\n" for rank, freq in enumerate(counts_desc, 1)
        )
    print(f"\nRank-frequency data saved to {csv_path}")
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script; importing
# the module must not trigger file reads or the CSV export.
if __name__ == "__main__":
    main()
|