#!/usr/bin/env python3
"""
Analyze specific ID ranges in pinyin_char_statistics.json
"""

import json
import sys
from collections import Counter
from itertools import islice
from pathlib import Path

# Within the inspected range, only IDs divisible by this step are printed,
# to give an overview without listing every row.
_OVERVIEW_STEP = 100

# Upper bound on how many frequency=1 entries are printed as samples.
_SAMPLE_LIMIT = 10


def _count_non_increasing_runs(counts):
    """Count maximal runs of length > 1 in which no value exceeds its predecessor."""
    runs = 0
    run_length = 1
    for previous, current in zip(counts, counts[1:]):
        if current <= previous:
            run_length += 1
            continue
        # The run was broken by an increase; it only counts if it had
        # at least two elements.
        if run_length > 1:
            runs += 1
        run_length = 1
    if run_length > 1:
        runs += 1
    return runs


def _report_range(id_to_count, first_pair_by_id, range_start, range_end):
    """Print the detailed analysis for IDs in ``range_start..range_end`` (inclusive)."""
    print(f"ID range {range_start}-{range_end} detailed analysis:")
    print("ID\tCount\tChar\tPinyin")

    rows = []
    for char_id in range(range_start, range_end + 1):
        if char_id not in id_to_count:
            continue
        # Indexed lookup instead of re-scanning every pair for every ID.
        pair = first_pair_by_id[char_id]
        char = pair.get("char", "")
        pinyin = pair.get("pinyin", "")
        count = pair.get("count", 0)
        rows.append((char_id, count, char, pinyin))
        if char_id % _OVERVIEW_STEP == 0:
            print(f"{char_id}\t{count}\t{char}\t{pinyin}")

    if not rows:
        return

    min_item = min(rows, key=lambda row: row[1])
    max_item = max(rows, key=lambda row: row[1])
    print(
        f"\nMin in range: ID {min_item[0]}, count {min_item[1]}, char '{min_item[2]}', pinyin '{min_item[3]}'"
    )
    print(
        f"Max in range: ID {max_item[0]}, count {max_item[1]}, char '{max_item[2]}', pinyin '{max_item[3]}'"
    )

    # Check whether counts are monotonic when walking the range by ascending ID.
    counts = [row[1] for row in rows]
    neighbours = list(zip(counts, counts[1:]))
    increasing = all(a <= b for a, b in neighbours)
    decreasing = all(a >= b for a, b in neighbours)
    print(f"Monotonic in range: increasing={increasing}, decreasing={decreasing}")

    # Count values shared by many IDs show up as plateaus.
    most_common = Counter(counts).most_common(5)
    print(f"Most common frequencies in range: {most_common}")


def _report_frequency_one(pairs, id_to_count):
    """Print the analysis of entries whose count is exactly 1."""
    print("\n\nAnalysis of frequency=1 entries:")
    freq_one_ids = [char_id for char_id, count in id_to_count.items() if count == 1]
    print(f"Number of entries with frequency=1: {len(freq_one_ids)}")

    if not freq_one_ids:
        return

    print(f"ID range of frequency=1: {min(freq_one_ids)} to {max(freq_one_ids)}")
    # These slices follow the order in which IDs were read from the file,
    # not sorted order.
    print(f"First 10 IDs: {freq_one_ids[:10]}")
    print(f"Last 10 IDs: {freq_one_ids[-10:]}")

    sorted_ids = sorted(freq_one_ids)
    contiguous = all(a + 1 == b for a, b in zip(sorted_ids, sorted_ids[1:]))
    print(f"Are they contiguous IDs? {contiguous}")

    print("\nSample characters with frequency=1:")
    singles = (pair for pair in pairs.values() if pair.get("count") == 1)
    for pair in islice(singles, _SAMPLE_LIMIT):
        print(
            f"  ID {pair.get('id')}: char '{pair.get('char')}', pinyin '{pair.get('pinyin')}'"
        )


def _report_overall_ordering(id_to_count):
    """Print how counts behave across all IDs taken in ascending ID order."""
    print("\n\nOverall ID-frequency ordering analysis:")
    all_ids = sorted(id_to_count)
    all_counts = [id_to_count[char_id] for char_id in all_ids]

    print(f"Total IDs: {len(all_ids)}")
    print(f"Non-increasing segments: {_count_non_increasing_runs(all_counts)}")

    # A "plateau" is a count value shared by more than one ID.
    plateaus = [
        (freq, shared) for freq, shared in Counter(all_counts).items() if shared > 1
    ]
    top_plateaus = sorted(plateaus, key=lambda item: item[1], reverse=True)[:10]
    print("Top 10 frequency plateaus (freq: count of IDs sharing that freq):")
    for freq, shared in top_plateaus:
        print(f"  {freq}: {shared} IDs")


def main(json_path=None, range_start=5000, range_end=5500):
    """Load the statistics JSON and print three reports to stdout.

    The reports are: a detailed look at the IDs in
    ``range_start..range_end`` (inclusive), an analysis of the entries whose
    count is 1, and an overall check of how counts are ordered by ID.

    The JSON document is read as an object whose ``"pairs"`` member maps
    keys to objects; the ``"id"``, ``"count"``, ``"char"`` and ``"pinyin"``
    members of each are used. Entries lacking ``"id"`` or ``"count"`` are
    left out of the ID-to-count mapping.

    Args:
        json_path: File to analyze. When ``None`` the file
            ``src/model/assets/pinyin_char_statistics.json`` located relative
            to this script's parent directory is used.
        range_start: First ID of the detailed range.
        range_end: Last ID of the detailed range (inclusive).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if json_path is None:
        json_path = (
            Path(__file__).parent.parent
            / "src"
            / "model"
            / "assets"
            / "pinyin_char_statistics.json"
        )

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    pairs = data.get("pairs", {})

    # One pass builds both lookups: ID -> count (last entry wins on duplicate
    # IDs) and ID -> first entry carrying that ID (used for char/pinyin).
    id_to_count = {}
    first_pair_by_id = {}
    for pair in pairs.values():
        char_id = pair.get("id")
        if char_id is None:
            continue
        first_pair_by_id.setdefault(char_id, pair)
        count = pair.get("count")
        if count is not None:
            id_to_count[char_id] = count

    _report_range(id_to_count, first_pair_by_id, range_start, range_end)
    _report_frequency_one(pairs, id_to_count)
    _report_overall_ordering(id_to_count)


if __name__ == "__main__":
    main()