SUimeModelTraner/analyze_frequency.py

237 lines
9.1 KiB
Python

#!/usr/bin/env python3
"""
Analyze frequency distribution in pinyin_char_statistics.json
"""
import json
import sys
import os
import math
from collections import defaultdict
from pathlib import Path
def _percentile(sorted_values, fraction):
    """Return the element at index ``int(fraction * n)`` of a sorted list.

    The index is clamped to the last element so the lookup can never run
    past the end of the list. ``sorted_values`` must be non-empty.
    """
    index = min(int(fraction * len(sorted_values)), len(sorted_values) - 1)
    return sorted_values[index]


def _collect_counts(pairs):
    """Extract the numeric counts from the ``pairs`` mapping.

    Returns a ``(counts, id_to_count)`` tuple. Entries missing ``id`` or
    ``count`` are skipped silently; malformed entries (not a dict,
    non-numeric count, unhashable id) are skipped with a warning so that
    they cannot crash the statistics computed afterwards.
    """
    counts = []
    id_to_count = {}
    for key, pair in pairs.items():
        if not isinstance(pair, dict):
            print(f"Warning: Could not parse pair {key}: "
                  f"expected an object, got {type(pair).__name__}")
            continue
        char_id = pair.get('id')
        count = pair.get('count')
        if char_id is None or count is None:
            continue
        # bool is a subclass of int, but a JSON true/false is not a count.
        if isinstance(count, bool) or not isinstance(count, (int, float)):
            print(f"Warning: Could not parse pair {key}: "
                  f"non-numeric count {count!r}")
            continue
        try:
            id_to_count[char_id] = count
        except TypeError as e:  # unhashable id, e.g. a JSON list
            print(f"Warning: Could not parse pair {key}: {e}")
            continue
        # Append only after the id was stored so both containers agree.
        counts.append(count)
    return counts, id_to_count


def _log_histogram(counts, min_count, max_count, num_bins=20):
    """Bucket the positive counts into ``num_bins`` log10-spaced bins.

    Returns ``(bin_edges, hist)``. ``max_count`` must be positive. When the
    log range is empty (e.g. all positive counts are equal) every positive
    count is placed in the first bin instead of dividing by zero.
    """
    log_min = math.log10(min_count) if min_count > 0 else 0
    log_max = math.log10(max_count)
    span = log_max - log_min
    bin_edges = [10 ** (log_min + i * span / num_bins)
                 for i in range(num_bins + 1)]
    hist = [0] * num_bins
    for count in counts:
        if count <= 0:
            continue
        if span > 0:
            position = (math.log10(count) - log_min) / span * num_bins
            # Clamp on both sides: the maximum maps to num_bins, and values
            # below the lowest edge would otherwise yield a negative index.
            bin_idx = max(0, min(int(position), num_bins - 1))
        else:
            bin_idx = 0
        hist[bin_idx] += 1
    return bin_edges, hist


def main():
    """Print a frequency report for ``pinyin_char_statistics.json``.

    Loads ``src/model/assets/pinyin_char_statistics.json`` (relative to the
    current working directory), prints basic statistics, percentiles, an
    ID-order analysis, a look at the ID range 5000-5500, a log-scale
    histogram and a low-frequency summary, then writes a short summary to
    ``frequency_analysis_results.txt`` in the current working directory.

    Exits with status 1 when the input file is missing or its ``pairs``
    value is not a JSON object; returns early when no usable counts exist.
    """
    # Path to the JSON file
    json_path = Path("src/model/assets/pinyin_char_statistics.json")
    if not json_path.exists():
        print(f"Error: File not found: {json_path}")
        sys.exit(1)

    print(f"Loading {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Timestamp: {data.get('timestamp')}")
    print(f"Total characters: {data.get('total_characters')}")
    print(f"Total pinyins: {data.get('total_pinyins')}")
    print(f"Valid input character count: {data.get('valid_input_character_count')}")

    pairs = data.get('pairs', {})
    if not isinstance(pairs, dict):
        print(f"Error: 'pairs' must be a JSON object, got {type(pairs).__name__}")
        sys.exit(1)
    print(f"Number of pairs: {len(pairs)}")

    # Extract counts and IDs
    counts, id_to_count = _collect_counts(pairs)
    if not counts:
        print("No valid count data found.")
        return

    # Basic statistics
    min_count = min(counts)
    max_count = max(counts)
    total_count = sum(counts)
    n = len(counts)
    mean_count = total_count / n

    # Percentiles are read from the sorted counts.
    sorted_counts = sorted(counts)
    p10 = _percentile(sorted_counts, 0.1)
    p25 = _percentile(sorted_counts, 0.25)
    p50 = _percentile(sorted_counts, 0.5)
    p75 = _percentile(sorted_counts, 0.75)
    p90 = _percentile(sorted_counts, 0.9)
    p99 = _percentile(sorted_counts, 0.99)

    # Population variance and standard deviation
    variance = sum((x - mean_count) ** 2 for x in counts) / n
    std_dev = math.sqrt(variance)

    print("\n=== BASIC STATISTICS ===")
    print(f"Min frequency: {min_count}")
    print(f"Max frequency: {max_count}")
    print(f"Mean frequency: {mean_count:.2f}")
    print(f"Standard deviation: {std_dev:.2f}")
    print(f"Total frequency sum: {total_count}")
    print(f"Number of entries: {n}")

    print("\n=== PERCENTILES ===")
    print(f"10th percentile: {p10}")
    print(f"25th percentile: {p25}")
    print(f"50th percentile (median): {p50}")
    print(f"75th percentile: {p75}")
    print(f"90th percentile: {p90}")
    print(f"99th percentile: {p99}")

    # Find IDs with min and max counts
    min_ids = [cid for cid, count in id_to_count.items() if count == min_count]
    max_ids = [cid for cid, count in id_to_count.items() if count == max_count]
    print(f"\nIDs with min frequency ({min_count}): {min_ids}")
    print(f"IDs with max frequency ({max_count}): {max_ids}")

    # Check if IDs are assigned in frequency order: walk the counts in
    # ascending ID order and tally how often neighbours fall or rise.
    ids = list(id_to_count)
    sorted_by_id = sorted(ids)
    counts_by_id = [id_to_count[cid] for cid in sorted_by_id]
    decreases = 0
    increases = 0
    for previous, current in zip(counts_by_id, counts_by_id[1:]):
        if current < previous:
            decreases += 1
        elif current > previous:
            increases += 1

    print("\n=== ID ORDER ANALYSIS ===")
    print(f"Total pairs: {len(counts_by_id)}")
    print(f"Decreases as ID increases: {decreases} times")
    print(f"Increases as ID increases: {increases} times")
    transitions = len(counts_by_id) - 1
    if transitions > 0:
        print(f"Percentage decreasing: {decreases/transitions*100:.2f}%")
    else:
        # A single ID has no neighbour to compare with.
        print("Percentage decreasing: N/A (fewer than two IDs)")

    # Compare each ID's rank in ID order with its rank in descending
    # frequency order (a simplified stand-in for a rank correlation).
    sorted_by_count = sorted(ids, key=lambda cid: id_to_count[cid], reverse=True)
    rank_by_id = {cid: rank for rank, cid in enumerate(sorted_by_id)}
    rank_by_count = {cid: rank for rank, cid in enumerate(sorted_by_count)}
    rank_diffs = [abs(rank_by_id[cid] - rank_by_count[cid]) for cid in ids]
    avg_rank_diff = sum(rank_diffs) / len(rank_diffs)
    max_rank_diff = max(rank_diffs)
    print(f"Average rank difference between ID order and frequency order: {avg_rank_diff:.2f}")
    print(f"Maximum rank difference: {max_rank_diff}")

    # Analyze specific ID range 5000-5500
    print("\n=== ANALYSIS OF ID RANGE 5000-5500 ===")
    # None means "no IDs in range"; 0 would be a legitimate minimum.
    range_min = None
    range_max = None
    range_ids = [cid for cid in range(5000, 5501) if cid in id_to_count]
    range_counts = [id_to_count[cid] for cid in range_ids]
    if range_counts:
        range_min = min(range_counts)
        range_max = max(range_counts)
        range_mean = sum(range_counts) / len(range_counts)
        range_sorted = sorted(range_counts)
        range_p10 = _percentile(range_sorted, 0.1)
        range_p50 = _percentile(range_sorted, 0.5)
        range_p90 = _percentile(range_sorted, 0.9)
        print(f"IDs in range 5000-5500: {len(range_counts)}")
        print(f"Min frequency in range: {range_min}")
        print(f"Max frequency in range: {range_max}")
        print(f"Mean frequency in range: {range_mean:.2f}")
        print(f"10th percentile in range: {range_p10}")
        print(f"50th percentile in range: {range_p50}")
        print(f"90th percentile in range: {range_p90}")
        # Find IDs with min frequency in this range
        min_in_range_ids = [cid for cid in range_ids if id_to_count[cid] == range_min]
        print(f"IDs with min frequency in range: {min_in_range_ids[:10]}{'...' if len(min_in_range_ids) > 10 else ''}")
    else:
        print("No IDs found in range 5000-5500")

    # Histogram of frequencies (log bins)
    print("\n=== FREQUENCY DISTRIBUTION (LOG BINS) ===")
    if max_count > 0:
        bin_edges, hist = _log_histogram(counts, min_count, max_count)
        print("Log-scale histogram (count range -> frequency count):")
        for i, entries in enumerate(hist):
            if entries > 0:
                lower = bin_edges[i]
                upper = bin_edges[i + 1]
                print(f" {lower:.2e} - {upper:.2e}: {entries} entries")

    # Check for zero or near-zero frequencies
    zero_count = sum(1 for c in counts if c == 0)
    low_count = sum(1 for c in counts if 0 < c <= 10)
    very_low_count = sum(1 for c in counts if 0 < c <= 100)
    print("\n=== LOW FREQUENCY ANALYSIS ===")
    print(f"Entries with zero frequency: {zero_count}")
    print(f"Entries with frequency <= 10: {low_count}")
    print(f"Entries with frequency <= 100: {very_low_count}")

    # Find the actual min frequency (excluding zeros if any)
    non_zero_counts = [c for c in counts if c > 0]
    if non_zero_counts:
        actual_min = min(non_zero_counts)
        print(f"Actual min frequency (non-zero): {actual_min}")
        actual_min_ids = [cid for cid, count in id_to_count.items() if count == actual_min]
        print(f"IDs with actual min frequency: {actual_min_ids[:10]}{'...' if len(actual_min_ids) > 10 else ''}")

    # Summary for smoothing algorithm design
    print("\n=== SUMMARY FOR SMOOTHING ALGORITHM DESIGN ===")
    print(f"Frequency range spans {max_count/min_count if min_count>0 else 'inf'}:1 ratio")
    # The median is a frequency value, not a number of entries.
    print(f"Half of the entries have frequency <= {p50} (median)")
    print(f"Top 10% of entries have frequency > {p90}")
    print(f"Bottom 10% of entries have frequency < {p10}")
    print(f"ID order is {'roughly' if decreases > increases else 'not'} sorted by frequency")

    # Save detailed data for further analysis
    output_file = "frequency_analysis_results.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Frequency Analysis Results\n")
        f.write("="*50 + "\n")
        f.write(f"Min frequency: {min_count}\n")
        f.write(f"Max frequency: {max_count}\n")
        f.write(f"Mean frequency: {mean_count:.2f}\n")
        f.write(f"Standard deviation: {std_dev:.2f}\n")
        f.write(f"10th percentile: {p10}\n")
        f.write(f"50th percentile: {p50}\n")
        f.write(f"90th percentile: {p90}\n")
        f.write(f"IDs in range 5000-5500 min: {range_min if range_min is not None else 'N/A'}\n")
        f.write(f"IDs in range 5000-5500 max: {range_max if range_max is not None else 'N/A'}\n")
    print(f"\nDetailed results saved to {output_file}")
# Run the analysis only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()