237 lines
9.1 KiB
Python
237 lines
9.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze frequency distribution in pinyin_char_statistics.json
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import os
|
|
import math
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
def main():
    """Print a frequency-distribution report for pinyin_char_statistics.json.

    Loads ``src/model/assets/pinyin_char_statistics.json`` (relative to the
    current working directory), reads the ``id`` and ``count`` of every entry
    under the top-level ``pairs`` mapping, and prints:

    * basic statistics (min, max, mean, population standard deviation),
    * nearest-rank style percentiles,
    * how well ascending ID order matches descending count order,
    * statistics for the ID range 5000-5500,
    * a 20-bin log-scale histogram of the positive counts,
    * counts of zero / low-frequency entries.

    A short summary is also written to ``frequency_analysis_results.txt`` in
    the current working directory.

    Exits with status 1 when the input file does not exist, and returns
    early (without writing the summary file) when no usable counts are found.
    """
    # Path to the JSON file
    json_path = Path("src/model/assets/pinyin_char_statistics.json")
    if not json_path.exists():
        print(f"Error: File not found: {json_path}")
        sys.exit(1)

    print(f"Loading {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Timestamp: {data.get('timestamp')}")
    print(f"Total characters: {data.get('total_characters')}")
    print(f"Total pinyins: {data.get('total_pinyins')}")
    print(f"Valid input character count: {data.get('valid_input_character_count')}")

    pairs = data.get('pairs', {})
    print(f"Number of pairs: {len(pairs)}")

    # Extract counts and IDs. Malformed entries are reported and skipped here
    # so they cannot crash the numeric analysis further down.
    counts = []
    id_to_count = {}
    for key, pair in pairs.items():
        if not isinstance(pair, dict):
            print(f"Warning: Could not parse pair {key}: "
                  f"expected an object, got {type(pair).__name__}")
            continue
        pair_id = pair.get('id')
        count = pair.get('count')
        if pair_id is None or count is None:
            continue
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(count, bool) or not isinstance(count, (int, float)):
            print(f"Warning: Could not parse pair {key}: "
                  f"non-numeric count {count!r}")
            continue
        counts.append(count)
        id_to_count[pair_id] = count

    if not counts:
        print("No valid count data found.")
        return

    # Basic statistics
    min_count = min(counts)
    max_count = max(counts)
    total_count = sum(counts)
    mean_count = total_count / len(counts)

    # Sort counts for percentiles
    sorted_counts = sorted(counts)
    n = len(sorted_counts)

    # Percentiles: int(q * n) is always < n for q < 1, so indexing is safe.
    p10 = sorted_counts[int(0.1 * n)]
    p25 = sorted_counts[int(0.25 * n)]
    p50 = sorted_counts[int(0.5 * n)]
    p75 = sorted_counts[int(0.75 * n)]
    p90 = sorted_counts[int(0.9 * n)]
    p99 = sorted_counts[int(0.99 * n)]

    # Population variance and std dev
    variance = sum((x - mean_count) ** 2 for x in counts) / n
    std_dev = math.sqrt(variance)

    print("\n=== BASIC STATISTICS ===")
    print(f"Min frequency: {min_count}")
    print(f"Max frequency: {max_count}")
    print(f"Mean frequency: {mean_count:.2f}")
    print(f"Standard deviation: {std_dev:.2f}")
    print(f"Total frequency sum: {total_count}")
    print(f"Number of entries: {n}")

    print("\n=== PERCENTILES ===")
    print(f"10th percentile: {p10}")
    print(f"25th percentile: {p25}")
    print(f"50th percentile (median): {p50}")
    print(f"75th percentile: {p75}")
    print(f"90th percentile: {p90}")
    print(f"99th percentile: {p99}")

    # Find IDs with min and max counts
    min_ids = [pid for pid, c in id_to_count.items() if c == min_count]
    max_ids = [pid for pid, c in id_to_count.items() if c == max_count]

    print(f"\nIDs with min frequency ({min_count}): {min_ids}")
    print(f"IDs with max frequency ({max_count}): {max_ids}")

    # Check if IDs are assigned in frequency order:
    # walk the counts in ascending ID order and tally the direction of change.
    ids = list(id_to_count)
    sorted_by_id = sorted(ids)
    counts_by_id = [id_to_count[pid] for pid in sorted_by_id]

    decreases = 0
    increases = 0
    for prev, cur in zip(counts_by_id, counts_by_id[1:]):
        if cur < prev:
            decreases += 1
        elif cur > prev:
            increases += 1

    # With a single entry there are no transitions; report 0% instead of
    # dividing by zero.
    transitions = len(counts_by_id) - 1
    pct_decreasing = decreases / transitions * 100 if transitions > 0 else 0.0

    print("\n=== ID ORDER ANALYSIS ===")
    print(f"Total pairs: {len(counts_by_id)}")
    print(f"Decreases as ID increases: {decreases} times")
    print(f"Increases as ID increases: {increases} times")
    print(f"Percentage decreasing: {pct_decreasing:.2f}%")

    # Check if IDs are roughly sorted by frequency: compare each ID's rank in
    # ID order with its rank in descending-count order (a simplified stand-in
    # for a Spearman rank correlation).
    sorted_by_count = sorted(ids, key=lambda pid: id_to_count[pid], reverse=True)
    rank_by_id = {pid: i for i, pid in enumerate(sorted_by_id)}
    rank_by_count = {pid: i for i, pid in enumerate(sorted_by_count)}

    rank_diffs = [abs(rank_by_id[pid] - rank_by_count[pid]) for pid in ids]
    avg_rank_diff = sum(rank_diffs) / len(rank_diffs)
    max_rank_diff = max(rank_diffs)

    print(f"Average rank difference between ID order and frequency order: {avg_rank_diff:.2f}")
    print(f"Maximum rank difference: {max_rank_diff}")

    # Analyze specific ID range 5000-5500 (inclusive)
    print("\n=== ANALYSIS OF ID RANGE 5000-5500 ===")
    range_ids = [pid for pid in range(5000, 5501) if pid in id_to_count]
    range_counts = [id_to_count[pid] for pid in range_ids]

    # Stay None when the range is empty so the summary file can say "N/A".
    range_min = None
    range_max = None

    if range_counts:
        range_min = min(range_counts)
        range_max = max(range_counts)
        range_mean = sum(range_counts) / len(range_counts)
        range_sorted = sorted(range_counts)
        range_n = len(range_sorted)
        range_p10 = range_sorted[int(0.1 * range_n)]
        range_p50 = range_sorted[int(0.5 * range_n)]
        range_p90 = range_sorted[int(0.9 * range_n)]

        print(f"IDs in range 5000-5500: {len(range_counts)}")
        print(f"Min frequency in range: {range_min}")
        print(f"Max frequency in range: {range_max}")
        print(f"Mean frequency in range: {range_mean:.2f}")
        print(f"10th percentile in range: {range_p10}")
        print(f"50th percentile in range: {range_p50}")
        print(f"90th percentile in range: {range_p90}")

        # Find IDs with min frequency in this range
        min_in_range_ids = [pid for pid in range_ids if id_to_count[pid] == range_min]
        print(f"IDs with min frequency in range: {min_in_range_ids[:10]}{'...' if len(min_in_range_ids) > 10 else ''}")
    else:
        print("No IDs found in range 5000-5500")

    # Histogram of frequencies (log bins)
    print("\n=== FREQUENCY DISTRIBUTION (LOG BINS) ===")
    if max_count > 0:
        log_min = math.log10(min_count) if min_count > 0 else 0
        log_max = math.log10(max_count)
        log_span = log_max - log_min
        num_bins = 20

        if log_span > 0:
            bin_edges = [10 ** (log_min + i * log_span / num_bins)
                         for i in range(num_bins + 1)]

            hist = [0] * num_bins
            for count in counts:
                if count > 0:
                    log_val = math.log10(count)
                    bin_idx = int((log_val - log_min) / log_span * num_bins)
                    # Clamp: the maximum lands exactly on the upper edge, and
                    # a fractional count below 10**log_min would go negative.
                    bin_idx = max(0, min(bin_idx, num_bins - 1))
                    hist[bin_idx] += 1

            print("Log-scale histogram (count range -> frequency count):")
            for i in range(num_bins):
                if hist[i] > 0:
                    lower = bin_edges[i]
                    upper = bin_edges[i + 1]
                    print(f"  {lower:.2e} - {upper:.2e}: {hist[i]} entries")
        else:
            # Every positive count maps to the same log value, so the bins
            # have zero width; report them as one bucket instead of dividing
            # by zero.
            positive_entries = sum(1 for c in counts if c > 0)
            print("Log-scale histogram (count range -> frequency count):")
            print(f"  {10 ** log_min:.2e} - {10 ** log_max:.2e}: {positive_entries} entries")

    # Check for zero or near-zero frequencies
    zero_count = sum(1 for c in counts if c == 0)
    low_count = sum(1 for c in counts if 0 < c <= 10)
    very_low_count = sum(1 for c in counts if 0 < c <= 100)

    print("\n=== LOW FREQUENCY ANALYSIS ===")
    print(f"Entries with zero frequency: {zero_count}")
    print(f"Entries with frequency <= 10: {low_count}")
    print(f"Entries with frequency <= 100: {very_low_count}")

    # Find the actual min frequency (excluding zeros if any)
    non_zero_counts = [c for c in counts if c > 0]
    if non_zero_counts:
        actual_min = min(non_zero_counts)
        print(f"Actual min frequency (non-zero): {actual_min}")
        actual_min_ids = [pid for pid, c in id_to_count.items() if c == actual_min]
        print(f"IDs with actual min frequency: {actual_min_ids[:10]}{'...' if len(actual_min_ids) > 10 else ''}")

    # Summary for smoothing algorithm design
    print("\n=== SUMMARY FOR SMOOTHING ALGORITHM DESIGN ===")
    print(f"Frequency range spans {max_count/min_count if min_count>0 else 'inf'}:1 ratio")
    print(f"Most entries ({p50}) have frequency around {p50}")
    print(f"Top 10% of entries have frequency > {p90}")
    print(f"Bottom 10% of entries have frequency < {p10}")
    print(f"ID order is {'roughly' if decreases > increases else 'not'} sorted by frequency")

    # Save detailed data for further analysis
    output_file = "frequency_analysis_results.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Frequency Analysis Results\n")
        f.write("="*50 + "\n")
        f.write(f"Min frequency: {min_count}\n")
        f.write(f"Max frequency: {max_count}\n")
        f.write(f"Mean frequency: {mean_count:.2f}\n")
        f.write(f"Standard deviation: {std_dev:.2f}\n")
        f.write(f"10th percentile: {p10}\n")
        f.write(f"50th percentile: {p50}\n")
        f.write(f"90th percentile: {p90}\n")
        f.write(f"IDs in range 5000-5500 min: {range_min if range_min is not None else 'N/A'}\n")
        f.write(f"IDs in range 5000-5500 max: {range_max if range_max is not None else 'N/A'}\n")

    print(f"\nDetailed results saved to {output_file}")
|
|
|
|
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|