136 lines
4.7 KiB
Python
136 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze specific ID ranges in pinyin_char_statistics.json
|
|
"""
|
|
|
|
import json
import sys
from collections import Counter
from pathlib import Path
|
|
|
|
|
|
def _collect_id_counts(pairs):
    """Map each entry's ``id`` to its ``count``.

    Entries whose ``id`` or ``count`` is missing (or ``None``) are skipped.
    If several entries share an ID, the last one seen wins.
    """
    id_to_count = {}
    for pair in pairs.values():
        char_id = pair.get("id")
        count = pair.get("count")
        if char_id is not None and count is not None:
            id_to_count[char_id] = count
    return id_to_count


def _report_id_range(pairs, id_to_count, id_start, id_end):
    """Print overview rows and summary statistics for IDs in one range.

    The range is inclusive on both ends. Only IDs present in ``id_to_count``
    are considered; the summary (min/max, monotonicity, most common counts)
    is printed only when at least one such ID exists.
    """
    print(f"ID range {id_start}-{id_end} detailed analysis:")
    print("ID\tCount\tChar\tPinyin")

    # Index entries by ID once (first occurrence wins) so each ID in the
    # range is a dict lookup instead of a scan over every entry.
    pair_by_id = {}
    for pair in pairs.values():
        pair_by_id.setdefault(pair.get("id"), pair)

    range_data = []
    for char_id in range(id_start, id_end + 1):
        if char_id not in id_to_count:
            continue
        pair = pair_by_id[char_id]
        char = pair.get("char", "")
        pinyin = pair.get("pinyin", "")
        count = pair.get("count", 0)
        range_data.append((char_id, count, char, pinyin))
        if char_id % 100 == 0:  # Print every 100th for overview
            print(f"{char_id}\t{count}\t{char}\t{pinyin}")

    if not range_data:
        return

    # Ties resolve to the lowest ID because range_data is in ID order.
    min_item = min(range_data, key=lambda item: item[1])
    max_item = max(range_data, key=lambda item: item[1])
    print(
        f"\nMin in range: ID {min_item[0]}, count {min_item[1]}, "
        f"char '{min_item[2]}', pinyin '{min_item[3]}'"
    )
    print(
        f"Max in range: ID {max_item[0]}, count {max_item[1]}, "
        f"char '{max_item[2]}', pinyin '{max_item[3]}'"
    )

    # Monotonicity is checked over neighbouring IDs that are present.
    counts = [item[1] for item in range_data]
    increasing = all(a <= b for a, b in zip(counts, counts[1:]))
    decreasing = all(a >= b for a, b in zip(counts, counts[1:]))
    print(f"Monotonic in range: increasing={increasing}, decreasing={decreasing}")

    most_common = Counter(counts).most_common(5)
    print(f"Most common frequencies in range: {most_common}")


def _report_singletons(pairs, id_to_count):
    """Print where the entries with a count of exactly 1 sit among the IDs."""
    print("\n\nAnalysis of frequency=1 entries:")
    freq_one_ids = [char_id for char_id, count in id_to_count.items() if count == 1]
    print(f"Number of entries with frequency=1: {len(freq_one_ids)}")
    if not freq_one_ids:
        return

    print(f"ID range of frequency=1: {min(freq_one_ids)} to {max(freq_one_ids)}")
    print(f"First 10 IDs: {freq_one_ids[:10]}")
    print(f"Last 10 IDs: {freq_one_ids[-10:]}")

    sorted_ids = sorted(freq_one_ids)
    contiguous = all(a + 1 == b for a, b in zip(sorted_ids, sorted_ids[1:]))
    print(f"Are they contiguous IDs? {contiguous}")

    print("\nSample characters with frequency=1:")
    shown = 0
    for pair in pairs.values():
        if pair.get("count") != 1:
            continue
        print(
            f" ID {pair.get('id')}: char '{pair.get('char')}', "
            f"pinyin '{pair.get('pinyin')}'"
        )
        shown += 1
        if shown == 10:  # ten samples are enough; stop scanning
            break


def _report_ordering(id_to_count):
    """Print how counts change as IDs increase, and which counts are shared."""
    print("\n\nOverall ID-frequency ordering analysis:")
    all_ids = sorted(id_to_count)
    all_counts = [id_to_count[char_id] for char_id in all_ids]

    # Count maximal runs (length > 1) in which each count is <= the previous.
    non_increasing_segments = 0
    run_length = 1
    for previous, current in zip(all_counts, all_counts[1:]):
        if current <= previous:
            run_length += 1
            continue
        if run_length > 1:
            non_increasing_segments += 1
        run_length = 1
    if run_length > 1:  # flush the run that reaches the end of the list
        non_increasing_segments += 1

    print(f"Total IDs: {len(all_ids)}")
    print(f"Non-increasing segments: {non_increasing_segments}")

    overall_freq_count = Counter(all_counts)
    plateaus = [
        (freq, n_ids) for freq, n_ids in overall_freq_count.items() if n_ids > 1
    ]
    plateaus_sorted = sorted(plateaus, key=lambda x: x[1], reverse=True)[:10]
    print("Top 10 frequency plateaus (freq: count of IDs sharing that freq):")
    for freq, n_ids in plateaus_sorted:
        print(f" {freq}: {n_ids} IDs")


def main(json_path=None, id_start=5000, id_end=5500):
    """Print an ID/count analysis of a pinyin character statistics file.

    The JSON document is read for a top-level ``"pairs"`` object whose values
    are objects with ``id``, ``count``, ``char`` and ``pinyin`` fields. Three
    reports are printed to stdout: a detailed look at one ID range, the
    entries whose count is 1, and the overall ordering of counts by ID.

    Args:
        json_path: File to analyze. Defaults to
            ``src/model/assets/pinyin_char_statistics.json`` located two
            directories above this script.
        id_start: First ID (inclusive) of the range examined in detail.
        id_end: Last ID (inclusive) of the range examined in detail.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if json_path is None:
        json_path = (
            Path(__file__).parent.parent
            / "src"
            / "model"
            / "assets"
            / "pinyin_char_statistics.json"
        )
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    pairs = data.get("pairs", {})
    id_to_count = _collect_id_counts(pairs)

    _report_id_range(pairs, id_to_count, id_start, id_end)
    _report_singletons(pairs, id_to_count)
    _report_ordering(id_to_count)
|
|
|
|
|
|
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|