#!/usr/bin/env python3
"""
Analyze frequency distribution in pinyin_char_statistics.json
"""

import json
import sys
import os
import math
from collections import defaultdict
from pathlib import Path


def _extract_counts(pairs):
    """Collect the usable frequency data from the ``pairs`` mapping.

    Each value is read for its ``id`` and ``count`` keys. Pairs missing
    either value are skipped silently; malformed pairs (not a dict,
    non-numeric count, unhashable id) are reported with a warning and skipped.

    Returns:
        A ``(counts, id_to_count)`` tuple: the list of every accepted count,
        and a dict mapping each id to its count (a later pair with the same
        id overwrites an earlier one).
    """
    counts = []
    id_to_count = {}
    for key, pair in pairs.items():
        if not isinstance(pair, dict):
            print(
                f"Warning: Could not parse pair {key}: "
                f"expected an object, got {type(pair).__name__}"
            )
            continue

        char_id = pair.get('id')
        count = pair.get('count')
        if char_id is None or count is None:
            continue

        # bool is a subclass of int; reject it so True/False are not counted.
        if isinstance(count, bool) or not isinstance(count, (int, float)):
            print(f"Warning: Could not parse pair {key}: non-numeric count {count!r}")
            continue

        # Insert into the dict first: an unhashable id must not leave a
        # count behind in ``counts`` without a matching dict entry.
        try:
            id_to_count[char_id] = count
        except TypeError as e:
            print(f"Warning: Could not parse pair {key}: {e}")
            continue
        counts.append(count)
    return counts, id_to_count


def _percentile(sorted_values, fraction):
    """Return the element at index ``int(fraction * n)`` of a sorted list."""
    last = len(sorted_values) - 1
    return sorted_values[min(int(fraction * len(sorted_values)), last)]


def _log_histogram(counts, num_bins=20):
    """Bin the positive values of ``counts`` into log10-spaced bins.

    The bins span the smallest to the largest positive value. When all
    positive values are equal the span is zero and everything goes into
    the first bin instead of dividing by zero.

    Returns:
        A list of ``(lower_edge, upper_edge, entries)`` tuples for the
        non-empty bins, in increasing order; empty if no value is positive.
    """
    positive = [c for c in counts if c > 0]
    if not positive:
        return []

    log_min = math.log10(min(positive))
    log_max = math.log10(max(positive))
    span = log_max - log_min
    edges = [10 ** (log_min + i * span / num_bins) for i in range(num_bins + 1)]

    hist = [0] * num_bins
    for value in positive:
        if span > 0:
            position = (math.log10(value) - log_min) / span * num_bins
            # The maximum value maps to num_bins exactly; clamp it into the last bin.
            bin_idx = min(int(position), num_bins - 1)
        else:
            bin_idx = 0
        hist[bin_idx] += 1

    return [
        (edges[i], edges[i + 1], entries)
        for i, entries in enumerate(hist)
        if entries > 0
    ]


def main():
    """Print a frequency report for the statistics file and save a summary.

    Reads ``src/model/assets/pinyin_char_statistics.json`` relative to the
    current working directory, prints basic statistics, percentiles, an
    ID-order analysis, an analysis of IDs 5000-5500, a log-scale histogram
    and a low-frequency analysis, then writes a short summary to
    ``frequency_analysis_results.txt``.

    Exits with status 1 if the input file does not exist. Returns early
    (without writing the summary file) if no usable counts are found.
    """
    # Path to the JSON file
    json_path = Path("src/model/assets/pinyin_char_statistics.json")
    if not json_path.exists():
        print(f"Error: File not found: {json_path}")
        sys.exit(1)

    print(f"Loading {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Timestamp: {data.get('timestamp')}")
    print(f"Total characters: {data.get('total_characters')}")
    print(f"Total pinyins: {data.get('total_pinyins')}")
    print(f"Valid input character count: {data.get('valid_input_character_count')}")

    pairs = data.get('pairs', {})
    print(f"Number of pairs: {len(pairs)}")

    counts, id_to_count = _extract_counts(pairs)
    if not counts:
        print("No valid count data found.")
        return

    # Basic statistics
    n = len(counts)
    min_count = min(counts)
    max_count = max(counts)
    total_count = sum(counts)
    mean_count = total_count / n

    # Percentiles
    sorted_counts = sorted(counts)
    p10, p25, p50, p75, p90, p99 = (
        _percentile(sorted_counts, q) for q in (0.1, 0.25, 0.5, 0.75, 0.9, 0.99)
    )

    # Population variance and standard deviation
    variance = sum((x - mean_count) ** 2 for x in counts) / n
    std_dev = math.sqrt(variance)

    print("\n=== BASIC STATISTICS ===")
    print(f"Min frequency: {min_count}")
    print(f"Max frequency: {max_count}")
    print(f"Mean frequency: {mean_count:.2f}")
    print(f"Standard deviation: {std_dev:.2f}")
    print(f"Total frequency sum: {total_count}")
    print(f"Number of entries: {n}")

    print("\n=== PERCENTILES ===")
    print(f"10th percentile: {p10}")
    print(f"25th percentile: {p25}")
    print(f"50th percentile (median): {p50}")
    print(f"75th percentile: {p75}")
    print(f"90th percentile: {p90}")
    print(f"99th percentile: {p99}")

    # Find IDs with min and max counts
    min_ids = [cid for cid, count in id_to_count.items() if count == min_count]
    max_ids = [cid for cid, count in id_to_count.items() if count == max_count]
    print(f"\nIDs with min frequency ({min_count}): {min_ids}")
    print(f"IDs with max frequency ({max_count}): {max_ids}")

    # Check if IDs are assigned in frequency order: walk the counts in
    # ascending-ID order and tally how often the count falls or rises.
    sorted_by_id = sorted(id_to_count)
    counts_by_id = [id_to_count[cid] for cid in sorted_by_id]
    decreases = 0
    increases = 0
    for previous, current in zip(counts_by_id, counts_by_id[1:]):
        if current < previous:
            decreases += 1
        elif current > previous:
            increases += 1

    # With a single ID there are no transitions; report 0% rather than
    # dividing by zero.
    transitions = len(counts_by_id) - 1
    pct_decreasing = decreases / transitions * 100 if transitions > 0 else 0.0

    print("\n=== ID ORDER ANALYSIS ===")
    print(f"Total pairs: {len(counts_by_id)}")
    print(f"Decreases as ID increases: {decreases} times")
    print(f"Increases as ID increases: {increases} times")
    print(f"Percentage decreasing: {pct_decreasing:.2f}%")

    # Compare each ID's position in ID order with its position in
    # descending-frequency order (a simplified rank comparison).
    sorted_by_count = sorted(id_to_count, key=lambda cid: id_to_count[cid], reverse=True)
    rank_by_id = {cid: rank for rank, cid in enumerate(sorted_by_id)}
    rank_by_count = {cid: rank for rank, cid in enumerate(sorted_by_count)}
    rank_diffs = [abs(rank_by_id[cid] - rank_by_count[cid]) for cid in id_to_count]
    avg_rank_diff = sum(rank_diffs) / len(rank_diffs)
    max_rank_diff = max(rank_diffs)
    print(f"Average rank difference between ID order and frequency order: {avg_rank_diff:.2f}")
    print(f"Maximum rank difference: {max_rank_diff}")

    # Analyze specific ID range 5000-5500
    print("\n=== ANALYSIS OF ID RANGE 5000-5500 ===")
    # None means "no IDs in the range"; the summary file reports it as N/A.
    range_min = None
    range_max = None
    range_ids = [cid for cid in range(5000, 5501) if cid in id_to_count]
    range_counts = [id_to_count[cid] for cid in range_ids]
    if range_counts:
        range_min = min(range_counts)
        range_max = max(range_counts)
        range_mean = sum(range_counts) / len(range_counts)
        range_sorted = sorted(range_counts)
        range_p10 = _percentile(range_sorted, 0.1)
        range_p50 = _percentile(range_sorted, 0.5)
        range_p90 = _percentile(range_sorted, 0.9)

        print(f"IDs in range 5000-5500: {len(range_counts)}")
        print(f"Min frequency in range: {range_min}")
        print(f"Max frequency in range: {range_max}")
        print(f"Mean frequency in range: {range_mean:.2f}")
        print(f"10th percentile in range: {range_p10}")
        print(f"50th percentile in range: {range_p50}")
        print(f"90th percentile in range: {range_p90}")

        # Find IDs with min frequency in this range (show at most ten)
        min_in_range_ids = [cid for cid in range_ids if id_to_count[cid] == range_min]
        print(f"IDs with min frequency in range: {min_in_range_ids[:10]}{'...' if len(min_in_range_ids) > 10 else ''}")
    else:
        print("No IDs found in range 5000-5500")

    # Histogram of frequencies (log bins)
    print("\n=== FREQUENCY DISTRIBUTION (LOG BINS) ===")
    histogram = _log_histogram(counts)
    if histogram:
        print("Log-scale histogram (count range -> frequency count):")
        for lower, upper, entries in histogram:
            print(f" {lower:.2e} - {upper:.2e}: {entries} entries")

    # Check for zero or near-zero frequencies
    zero_count = sum(1 for c in counts if c == 0)
    low_count = sum(1 for c in counts if 0 < c <= 10)
    very_low_count = sum(1 for c in counts if 0 < c <= 100)
    print("\n=== LOW FREQUENCY ANALYSIS ===")
    print(f"Entries with zero frequency: {zero_count}")
    print(f"Entries with frequency <= 10: {low_count}")
    print(f"Entries with frequency <= 100: {very_low_count}")

    # Find the actual min frequency (excluding zeros if any)
    non_zero_counts = [c for c in counts if c > 0]
    if non_zero_counts:
        actual_min = min(non_zero_counts)
        print(f"Actual min frequency (non-zero): {actual_min}")
        actual_min_ids = [cid for cid, count in id_to_count.items() if count == actual_min]
        print(f"IDs with actual min frequency: {actual_min_ids[:10]}{'...' if len(actual_min_ids) > 10 else ''}")

    # Summary for smoothing algorithm design
    print("\n=== SUMMARY FOR SMOOTHING ALGORITHM DESIGN ===")
    print(f"Frequency range spans {max_count/min_count if min_count>0 else 'inf'}:1 ratio")
    # NOTE(review): the parenthesised value below is the median frequency,
    # not a number of entries; the wording is kept as-is pending confirmation.
    print(f"Most entries ({p50}) have frequency around {p50}")
    print(f"Top 10% of entries have frequency > {p90}")
    print(f"Bottom 10% of entries have frequency < {p10}")
    print(f"ID order is {'roughly' if decreases > increases else 'not'} sorted by frequency")

    # Save detailed data for further analysis
    output_file = "frequency_analysis_results.txt"
    summary_lines = [
        "Frequency Analysis Results\n",
        "=" * 50 + "\n",
        f"Min frequency: {min_count}\n",
        f"Max frequency: {max_count}\n",
        f"Mean frequency: {mean_count:.2f}\n",
        f"Standard deviation: {std_dev:.2f}\n",
        f"10th percentile: {p10}\n",
        f"50th percentile: {p50}\n",
        f"90th percentile: {p90}\n",
        f"IDs in range 5000-5500 min: {range_min if range_min is not None else 'N/A'}\n",
        f"IDs in range 5000-5500 max: {range_max if range_max is not None else 'N/A'}\n",
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(summary_lines)

    print(f"\nDetailed results saved to {output_file}")


if __name__ == "__main__":
    main()