SUimeModelTraner/visualize_distribution.py

147 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""
Visualize frequency distribution with ASCII plots
"""
import json
import math
import sys
from pathlib import Path
def ascii_histogram(data, bins=20, width=60):
    """Render a text histogram of numeric ``data``.

    Values are grouped into ``bins`` buckets. Logarithmically spaced buckets
    are used when no value is negative and the maximum is more than 1000
    times the smallest positive value; otherwise the buckets are linearly
    spaced between the minimum and the maximum. Buckets with no entries are
    omitted from the output.

    Args:
        data: Sized, re-iterable collection of numbers (e.g. a list).
        bins: Number of buckets.
        width: Length, in ``#`` characters, of the bar of the fullest row.

    Returns:
        One line per non-empty bucket in the form
        ``"<low>-<high> | <bar> <count>"``, joined with newlines. In log
        mode, values equal to zero are reported in a leading row labelled
        ``0``. Returns ``""`` for empty input.
    """
    if not data:
        return ""

    min_val = min(data)
    max_val = max(data)

    # Constant data has a zero-width range, which would otherwise divide by
    # zero when mapping values to buckets; show it as one full-width row.
    if min_val == max_val:
        label = f"{min_val:.1f}-{max_val:.1f}"
        return f"{label:20} | {'#' * width} {len(data)}"

    # Lower edge for log bins: the smallest positive value, so that a zero
    # minimum can neither divide by zero in the range test nor hit log10(0).
    # Data containing negative values always uses linear bins.
    if min_val > 0:
        low = min_val
    elif min_val == 0:
        low = min(v for v in data if v > 0)  # exists because max_val > 0
    else:
        low = None

    rows = []  # (label, count) pairs in display order
    hist = [0] * bins

    if low is not None and max_val / low > 1000:
        # Use log bins for wide range
        log_min = math.log10(low)
        log_max = math.log10(max_val)
        bin_edges = [
            10 ** (log_min + i * (log_max - log_min) / bins)
            for i in range(bins + 1)
        ]
        zero_count = 0
        for val in data:
            if val > 0:
                log_val = math.log10(val)
                # The maximum maps to index ``bins``; clamp it into the last bucket.
                bin_idx = min(
                    int((log_val - log_min) / (log_max - log_min) * bins),
                    bins - 1,
                )
                hist[bin_idx] += 1
            else:
                zero_count += 1
        if zero_count:
            rows.append(("0", zero_count))
        bin_labels = [
            f"{bin_edges[i]:.1e}-{bin_edges[i+1]:.1e}" for i in range(bins)
        ]
    else:
        bin_width = (max_val - min_val) / bins
        bin_edges = [min_val + i * bin_width for i in range(bins + 1)]
        for val in data:
            # The maximum maps to index ``bins``; clamp it into the last bucket.
            bin_idx = min(
                int((val - min_val) / (max_val - min_val) * bins),
                bins - 1,
            )
            hist[bin_idx] += 1
        bin_labels = [
            f"{bin_edges[i]:.1f}-{bin_edges[i+1]:.1f}" for i in range(bins)
        ]

    rows.extend((label, count) for label, count in zip(bin_labels, hist) if count)

    # Bars are scaled so that the fullest row is exactly ``width`` characters.
    max_count = max(count for _, count in rows)
    return "\n".join(
        f"{label:20} | {'#' * int(count / max_count * width)} {count}"
        for label, count in rows
    )
def _scaled_log_bar(freq, max_freq, log_fn, width=40):
    """Return a ``#`` bar of length ``log_fn(freq) / log_fn(max_freq) * width``.

    An empty bar is returned when the logarithm is undefined (``freq <= 0``
    or ``max_freq <= 0``). When ``log_fn(max_freq)`` is zero the ratio is
    undefined, so entries equal to the maximum get a full-width bar and all
    others an empty one.
    """
    if freq <= 0 or max_freq <= 0:
        return ""
    denom = log_fn(max_freq)
    if denom == 0:
        return '#' * width if freq == max_freq else ""
    return '#' * int(log_fn(freq) / denom * width)


def main(json_path=Path("src/model/assets/pinyin_char_statistics.json"),
         csv_path="id_vs_freq.csv"):
    """Print an ASCII frequency-distribution report and export an ID/frequency CSV.

    The JSON file is expected to hold a ``"pairs"`` mapping whose values are
    dicts; entries with a non-null ``"count"`` contribute to the statistics,
    and entries that also have a non-null ``"id"`` are written to the CSV.

    Args:
        json_path: Statistics file to read.
        csv_path: Destination of the ``id,frequency`` CSV.
    """
    from collections import Counter

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    pairs = data.get('pairs', {})
    counts = [
        pair['count'] for pair in pairs.values()
        if pair.get('count') is not None
    ]

    print("FREQUENCY DISTRIBUTION ANALYSIS")
    print("="*60)

    # Every section below indexes into the counts, so bail out early
    # instead of failing with an IndexError on an empty data set.
    if not counts:
        print(f"\nNo frequency counts found in {json_path}")
        return

    print("\n1. ASCII Histogram (log bins):")
    print(ascii_histogram(counts, bins=20, width=60))

    # Rank-frequency plot in ASCII
    print("\n2. Rank-Frequency Relationship (Top 50):")
    counts_sorted_desc = sorted(counts, reverse=True)
    max_freq = counts_sorted_desc[0]
    # Slicing keeps this safe when fewer than 50 entries exist.
    for rank, freq in enumerate(counts_sorted_desc[:50], start=1):
        bar = _scaled_log_bar(freq, max_freq, math.log)
        print(f"Rank {rank:3}: {freq:12} {bar}")

    # ID vs Frequency plot (sampled)
    print("\n3. ID vs Frequency (sampled every 500 IDs):")
    # Build ID to count mapping
    id_to_count = {
        pair['id']: pair['count'] for pair in pairs.values()
        if pair.get('id') is not None and pair.get('count') is not None
    }
    print("ID Frequency log10(freq)")
    if id_to_count:
        # Only IDs that are exact multiples of 500 and present are shown.
        for sample_id in range(0, max(id_to_count) + 1, 500):
            if sample_id in id_to_count:
                freq = id_to_count[sample_id]
                log_freq = math.log10(freq) if freq > 0 else 0
                bar = _scaled_log_bar(freq, max_freq, math.log10)
                print(f"{sample_id:6} {freq:10} {log_freq:6.2f} {bar}")

    # Zipf's law fit
    print("\n4. Zipf's Law Analysis:")
    print(" Rank * Frequency ≈ constant for Zipf's law")
    print(" Top 10 ranks:")
    top_ten = counts_sorted_desc[:10]  # may be shorter than 10
    products = []
    for rank, freq in enumerate(top_ten, start=1):
        product = rank * freq
        products.append(product)
        print(f" Rank {rank}: {freq:12} rank*freq = {product:.3e}")
    # Check if product is roughly constant
    avg_product = sum(products) / len(products)
    std_product = math.sqrt(
        sum((p - avg_product)**2 for p in products) / len(products)
    )
    # The products above cover ranks 1..len(products), so label them as such.
    print(f" Average product (ranks 1-{len(products)}): {avg_product:.3e} ± {std_product:.3e}")
    if avg_product:
        print(f" Coefficient of variation: {std_product/avg_product*100:.1f}%")
    else:
        print(" Coefficient of variation: n/a")

    # Frequency spectrum
    freq_counter = Counter(counts)
    print("\n5. Frequency Spectrum (how many entries have each frequency):")
    print(" Frequency Count Cumulative")
    cum = 0
    for freq in sorted(freq_counter)[:20]:
        entries = freq_counter[freq]
        cum += entries
        print(f" {freq:10} {entries:6} {cum:6}")

    # Summary statistics
    print("\n6. Key Statistics:")
    n = len(counts)
    lowest = min(counts)
    highest = max(counts)
    print(f" Total entries: {n}")
    print(f" Min frequency: {lowest}")
    print(f" Max frequency: {highest}")
    if lowest:
        print(f" Ratio max/min: {highest/lowest:.2e}")
    else:
        print(" Ratio max/min: n/a (min frequency is 0)")
    # Values are read from the descending list, so each one is the cutoff
    # of the top p% of entries by rank rather than the p-th percentile.
    for p in [0.01, 0.1, 0.5, 0.9, 0.99]:
        idx = int(p * n)
        value = counts_sorted_desc[idx]
        print(f" Top {p*100:5.1f}% cutoff: {value:12} (rank ~{idx})")

    # Save data for external plotting
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("id,frequency\n")
        f.writelines(
            f"{char_id},{id_to_count[char_id]}\n"
            for char_id in sorted(id_to_count)
        )
    print(f"\nData saved to {csv_path} for external plotting")
# Script entry point: run the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()