SUimeModelTraner/scripts/analyze_frequency.py

267 lines
9.4 KiB
Python

#!/usr/bin/env python3
"""
Analyze frequency distribution in pinyin_char_statistics.json
"""
import json
import sys
import os
import math
from collections import defaultdict
from pathlib import Path
def _extract_counts(pairs):
    """Collect frequency counts from the ``pairs`` mapping.

    Entries that are not JSON objects, that carry a non-numeric ``count``,
    or whose ``id`` cannot be used as a dict key are reported and skipped.
    Entries lacking ``id`` or ``count`` are skipped silently.

    Args:
        pairs: Mapping of pair key -> entry dict with ``"id"`` and ``"count"``.

    Returns:
        A ``(counts, id_to_count)`` tuple: every accepted count in input
        order, and a dict mapping each entry ID to its count.
    """
    counts = []
    id_to_count = {}
    for key, pair in pairs.items():
        if not isinstance(pair, dict):
            print(
                f"Warning: Could not parse pair {key}: "
                f"expected an object, got {type(pair).__name__}"
            )
            continue
        entry_id = pair.get("id")
        count = pair.get("count")
        if entry_id is None or count is None:
            continue
        # bool is a subclass of int; a JSON true/false is not a frequency.
        if isinstance(count, bool) or not isinstance(count, (int, float)):
            print(f"Warning: Could not parse pair {key}: non-numeric count {count!r}")
            continue
        try:
            id_to_count[entry_id] = count
        except TypeError as e:  # unhashable ID, e.g. a JSON array
            print(f"Warning: Could not parse pair {key}: {e}")
            continue
        # Append only after the ID was accepted so both containers agree.
        counts.append(count)
    return counts, id_to_count


def _percentile(sorted_values, fraction):
    """Return the element at position ``int(fraction * n)`` of a sorted list.

    The index is clamped to the last element so ``fraction == 1.0`` is safe.
    ``sorted_values`` must be non-empty.
    """
    last = len(sorted_values) - 1
    return sorted_values[min(int(fraction * len(sorted_values)), last)]


def _log_histogram(positive_counts, num_bins=20):
    """Bucket strictly positive counts into logarithmically spaced bins.

    Args:
        positive_counts: Non-empty list of counts, all ``> 0``.
        num_bins: Number of equal-width bins in log10 space.

    Returns:
        A list of ``(lower, upper, entries)`` tuples for the non-empty bins,
        in increasing order. When all counts are equal a single degenerate
        bin ``(value, value, len(positive_counts))`` is returned.
    """
    log_min = math.log10(min(positive_counts))
    log_max = math.log10(max(positive_counts))
    span = log_max - log_min
    if span == 0:
        # All values identical: there is no range to divide into bins.
        edge = 10 ** log_min
        return [(edge, edge, len(positive_counts))]

    hist = [0] * num_bins
    for value in positive_counts:
        position = (math.log10(value) - log_min) / span * num_bins
        # The maximum lands exactly on the upper edge; fold it into the last bin.
        hist[min(int(position), num_bins - 1)] += 1

    edges = [10 ** (log_min + i * span / num_bins) for i in range(num_bins + 1)]
    return [
        (edges[i], edges[i + 1], entries)
        for i, entries in enumerate(hist)
        if entries > 0
    ]


def main(
    json_path=None,
    output_file="frequency_analysis_results.txt",
    id_range=(5000, 5500),
):
    """Print a frequency-distribution report for a pinyin/char statistics file.

    Loads the JSON statistics, prints basic statistics, percentiles, an
    ID-order analysis, a per-ID-range analysis, a log-scale histogram and a
    low-frequency summary, then writes a short summary to ``output_file``.

    Args:
        json_path: Path to the statistics JSON. Defaults to
            ``<this file>/../../src/model/assets/pinyin_char_statistics.json``.
        output_file: Path of the summary text file to write.
        id_range: Inclusive ``(low, high)`` ID range to analyze separately.

    Returns:
        None. Returns early (without writing ``output_file``) when the file
        contains no usable counts.

    Raises:
        SystemExit: With code 1 if the input file does not exist or its
            top-level JSON value is not an object.
    """
    if json_path is None:
        json_path = (
            Path(__file__).parent.parent
            / "src"
            / "model"
            / "assets"
            / "pinyin_char_statistics.json"
        )
    else:
        json_path = Path(json_path)

    if not json_path.exists():
        print(f"Error: File not found: {json_path}")
        sys.exit(1)

    print(f"Loading {json_path}...")
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        print(f"Error: Expected a JSON object at the top level of {json_path}")
        sys.exit(1)

    print(f"Timestamp: {data.get('timestamp')}")
    print(f"Total characters: {data.get('total_characters')}")
    print(f"Total pinyins: {data.get('total_pinyins')}")
    print(f"Valid input character count: {data.get('valid_input_character_count')}")

    pairs = data.get("pairs", {})
    if not isinstance(pairs, dict):
        print("Warning: 'pairs' is not a JSON object; ignoring it.")
        pairs = {}
    print(f"Number of pairs: {len(pairs)}")

    counts, id_to_count = _extract_counts(pairs)
    if not counts:
        print("No valid count data found.")
        return

    # Basic statistics
    min_count = min(counts)
    max_count = max(counts)
    total_count = sum(counts)
    n = len(counts)
    mean_count = total_count / n

    sorted_counts = sorted(counts)
    p10 = _percentile(sorted_counts, 0.10)
    p25 = _percentile(sorted_counts, 0.25)
    p50 = _percentile(sorted_counts, 0.50)
    p75 = _percentile(sorted_counts, 0.75)
    p90 = _percentile(sorted_counts, 0.90)
    p99 = _percentile(sorted_counts, 0.99)

    # Population variance (divides by n, not n - 1).
    variance = sum((x - mean_count) ** 2 for x in counts) / n
    std_dev = math.sqrt(variance)

    print("\n=== BASIC STATISTICS ===")
    print(f"Min frequency: {min_count}")
    print(f"Max frequency: {max_count}")
    print(f"Mean frequency: {mean_count:.2f}")
    print(f"Standard deviation: {std_dev:.2f}")
    print(f"Total frequency sum: {total_count}")
    print(f"Number of entries: {n}")

    print("\n=== PERCENTILES ===")
    print(f"10th percentile: {p10}")
    print(f"25th percentile: {p25}")
    print(f"50th percentile (median): {p50}")
    print(f"75th percentile: {p75}")
    print(f"90th percentile: {p90}")
    print(f"99th percentile: {p99}")

    min_ids = [eid for eid, count in id_to_count.items() if count == min_count]
    max_ids = [eid for eid, count in id_to_count.items() if count == max_count]
    print(f"\nIDs with min frequency ({min_count}): {min_ids}")
    print(f"IDs with max frequency ({max_count}): {max_ids}")

    # Check whether IDs are assigned in (descending) frequency order.
    # NOTE: sorting requires the IDs to be mutually comparable.
    sorted_by_id = sorted(id_to_count)
    counts_by_id = [id_to_count[eid] for eid in sorted_by_id]

    decreases = 0
    increases = 0
    for previous, current in zip(counts_by_id, counts_by_id[1:]):
        if current < previous:
            decreases += 1
        elif current > previous:
            increases += 1

    # With a single ID there are no adjacent pairs to compare.
    transitions = len(counts_by_id) - 1
    pct_decreasing = decreases / transitions * 100 if transitions > 0 else 0.0

    print("\n=== ID ORDER ANALYSIS ===")
    print(f"Total pairs: {len(counts_by_id)}")
    print(f"Decreases as ID increases: {decreases} times")
    print(f"Increases as ID increases: {increases} times")
    print(f"Percentage decreasing: {pct_decreasing:.2f}%")

    # Compare each ID's rank in ascending-ID order with its rank in
    # descending-frequency order; small differences mean low IDs are frequent.
    sorted_by_count = sorted(
        id_to_count, key=lambda eid: id_to_count[eid], reverse=True
    )
    rank_by_id = {eid: rank for rank, eid in enumerate(sorted_by_id)}
    rank_by_count = {eid: rank for rank, eid in enumerate(sorted_by_count)}
    rank_diffs = [abs(rank_by_id[eid] - rank_by_count[eid]) for eid in id_to_count]
    avg_rank_diff = sum(rank_diffs) / len(rank_diffs)
    max_rank_diff = max(rank_diffs)
    print(
        f"Average rank difference between ID order and frequency order: {avg_rank_diff:.2f}"
    )
    print(f"Maximum rank difference: {max_rank_diff}")

    # Analyze the requested ID range (inclusive on both ends).
    range_lo, range_hi = id_range
    range_label = f"{range_lo}-{range_hi}"
    print(f"\n=== ANALYSIS OF ID RANGE {range_label} ===")
    range_ids = [eid for eid in range(range_lo, range_hi + 1) if eid in id_to_count]
    range_counts = [id_to_count[eid] for eid in range_ids]
    range_min = None
    range_max = None
    if range_counts:
        range_min = min(range_counts)
        range_max = max(range_counts)
        range_mean = sum(range_counts) / len(range_counts)
        range_sorted = sorted(range_counts)
        range_p10 = _percentile(range_sorted, 0.10)
        range_p50 = _percentile(range_sorted, 0.50)
        range_p90 = _percentile(range_sorted, 0.90)
        print(f"IDs in range {range_label}: {len(range_counts)}")
        print(f"Min frequency in range: {range_min}")
        print(f"Max frequency in range: {range_max}")
        print(f"Mean frequency in range: {range_mean:.2f}")
        print(f"10th percentile in range: {range_p10}")
        print(f"50th percentile in range: {range_p50}")
        print(f"90th percentile in range: {range_p90}")
        min_in_range_ids = [eid for eid in range_ids if id_to_count[eid] == range_min]
        print(
            f"IDs with min frequency in range: {min_in_range_ids[:10]}{'...' if len(min_in_range_ids) > 10 else ''}"
        )
    else:
        print(f"No IDs found in range {range_label}")

    # Histogram of frequencies (log bins); only positive counts have a log.
    print("\n=== FREQUENCY DISTRIBUTION (LOG BINS) ===")
    non_zero_counts = [c for c in counts if c > 0]
    if non_zero_counts:
        print("Log-scale histogram (count range -> frequency count):")
        for lower, upper, entries in _log_histogram(non_zero_counts):
            print(f"  {lower:.2e} - {upper:.2e}: {entries} entries")

    # Check for zero or near-zero frequencies
    zero_count = sum(1 for c in counts if c == 0)
    low_count = sum(1 for c in counts if 0 < c <= 10)
    very_low_count = sum(1 for c in counts if 0 < c <= 100)
    print("\n=== LOW FREQUENCY ANALYSIS ===")
    print(f"Entries with zero frequency: {zero_count}")
    print(f"Entries with frequency <= 10: {low_count}")
    print(f"Entries with frequency <= 100: {very_low_count}")

    if non_zero_counts:
        actual_min = min(non_zero_counts)
        print(f"Actual min frequency (non-zero): {actual_min}")
        actual_min_ids = [
            eid for eid, count in id_to_count.items() if count == actual_min
        ]
        print(
            f"IDs with actual min frequency: {actual_min_ids[:10]}{'...' if len(actual_min_ids) > 10 else ''}"
        )

    # Summary for smoothing algorithm design
    print("\n=== SUMMARY FOR SMOOTHING ALGORITHM DESIGN ===")
    print(
        f"Frequency range spans {max_count / min_count if min_count > 0 else 'inf'}:1 ratio"
    )
    print(f"Typical (median) entry has frequency around {p50}")
    print(f"Top 10% of entries have frequency > {p90}")
    print(f"Bottom 10% of entries have frequency < {p10}")
    print(
        f"ID order is {'roughly' if decreases > increases else 'not'} sorted by frequency"
    )

    # Save the headline numbers for further analysis.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("Frequency Analysis Results\n")
        f.write("=" * 50 + "\n")
        f.write(f"Min frequency: {min_count}\n")
        f.write(f"Max frequency: {max_count}\n")
        f.write(f"Mean frequency: {mean_count:.2f}\n")
        f.write(f"Standard deviation: {std_dev:.2f}\n")
        f.write(f"10th percentile: {p10}\n")
        f.write(f"50th percentile: {p50}\n")
        f.write(f"90th percentile: {p90}\n")
        f.write(
            f"IDs in range {range_label} min: {range_min if range_min is not None else 'N/A'}\n"
        )
        f.write(
            f"IDs in range {range_label} max: {range_max if range_max is not None else 'N/A'}\n"
        )
    print(f"\nDetailed results saved to {output_file}")


if __name__ == "__main__":
    main()