147 lines
5.1 KiB
Python
147 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Visualize frequency distribution with ASCII plots
|
|
"""
|
|
|
|
import json
|
|
import math
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
def ascii_histogram(data, bins=20, width=60):
    """Render a horizontal ASCII histogram of ``data``.

    Logarithmic bins are used when the data is non-negative and spans more
    than three orders of magnitude (largest value / smallest positive value
    > 1000); otherwise equal-width linear bins are used.

    Args:
        data: Sequence of numbers. It is iterated more than once, so pass a
            list or tuple rather than a one-shot iterator.
        bins: Number of bins to divide the value range into.
        width: Length, in ``#`` characters, of the bar for the fullest bin.

    Returns:
        One line per non-empty bin, joined with newlines, in the form
        ``"<low>-<high> | <bar> <count>"``. Returns ``""`` for empty data.

    Note:
        On the logarithmic scale, values <= 0 have no bin and are left out of
        the plot.
    """
    if not data:
        return ""

    min_val = min(data)
    max_val = max(data)

    # The lower edge of a log scale must be strictly positive.  Testing the
    # ratio against the smallest positive value (instead of min_val) avoids a
    # ZeroDivisionError when the data contains zeros, and is identical to the
    # plain max/min ratio whenever every value is positive.
    positive_min = min((v for v in data if v > 0), default=None)
    use_log_bins = (
        min_val >= 0
        and positive_min is not None
        and max_val / positive_min > 1000
    )

    hist = [0] * bins
    if use_log_bins:
        log_min = math.log10(positive_min)
        log_max = math.log10(max_val)
        log_span = log_max - log_min
        bin_edges = [10 ** (log_min + i * log_span / bins) for i in range(bins + 1)]
        for val in data:
            if val > 0:
                # Clamp so that max_val lands in the last bin, not one past it.
                idx = min(int((math.log10(val) - log_min) / log_span * bins), bins - 1)
                hist[idx] += 1
        labels = [f"{lo:.1e}-{hi:.1e}" for lo, hi in zip(bin_edges, bin_edges[1:])]
    else:
        span = max_val - min_val
        bin_width = span / bins
        bin_edges = [min_val + i * bin_width for i in range(bins + 1)]
        if span == 0:
            # Every value is identical: there is no range to divide, so put
            # everything in the first bin rather than dividing by zero.
            hist[0] = len(data)
        else:
            for val in data:
                # Clamp so that max_val lands in the last bin, not one past it.
                idx = min(int((val - min_val) / span * bins), bins - 1)
                hist[idx] += 1
        labels = [f"{lo:.1f}-{hi:.1f}" for lo, hi in zip(bin_edges, bin_edges[1:])]

    max_count = max(hist)
    rows = []
    for label, count in zip(labels, hist):
        if count == 0:
            continue  # Empty bins are omitted to keep the plot compact.
        bar = '#' * int(count / max_count * width)
        rows.append(f"{label:20} | {bar} {count}")

    return "\n".join(rows)
|
|
|
|
def _log_bar(freq, max_freq, log, width=40):
    """Return a '#' bar of length ``log(freq) / log(max_freq) * width``.

    Returns an empty bar when the ratio is undefined (non-positive inputs, or
    ``max_freq == 1`` so that ``log(max_freq)`` is zero) instead of raising.
    """
    if freq <= 0 or max_freq <= 0:
        return ""
    denominator = log(max_freq)
    if denominator == 0:
        return ""
    return '#' * int(log(freq) / denominator * width)


def main(json_path=Path("src/model/assets/pinyin_char_statistics.json"),
         csv_path="id_vs_freq.csv"):
    """Print an ASCII frequency-distribution report and export an id/frequency CSV.

    The JSON file is expected to hold a top-level ``"pairs"`` object whose
    values are objects with a ``"count"`` and (optionally) an ``"id"`` field.
    Entries whose ``"count"`` is missing or null are ignored.

    The report has six sections: a histogram of counts, the top-50
    rank/frequency bars, counts sampled at every 500th id, a Zipf's-law check
    on the top 10 ranks, the 20 smallest distinct frequencies with how many
    entries share each, and summary statistics.

    Args:
        json_path: Path of the statistics JSON file to read.
        csv_path: Path of the ``id,frequency`` CSV file to write.
    """
    from collections import Counter

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    pairs = data.get('pairs', {})
    counts = [pair['count'] for pair in pairs.values() if pair.get('count') is not None]

    print("FREQUENCY DISTRIBUTION ANALYSIS")
    print("=" * 60)

    if not counts:
        # Every section below indexes into the sorted counts; bail out early
        # with a clear message instead of failing with an IndexError.
        print(f"\nNo entries with a 'count' found in {json_path}; nothing to analyze.")
        return

    print("\n1. ASCII Histogram (log bins):")
    print(ascii_histogram(counts, bins=20, width=60))

    # Rank-frequency plot in ASCII
    print("\n2. Rank-Frequency Relationship (Top 50):")
    counts_sorted_desc = sorted(counts, reverse=True)
    max_freq = counts_sorted_desc[0]

    # Slicing tolerates data sets with fewer than 50 entries.
    for rank, freq in enumerate(counts_sorted_desc[:50], start=1):
        bar = _log_bar(freq, max_freq, math.log)
        print(f"Rank {rank:3}: {freq:12} {bar}")

    # ID vs Frequency plot (sampled)
    print("\n3. ID vs Frequency (sampled every 500 IDs):")
    id_to_count = {}
    for pair in pairs.values():
        char_id = pair.get('id')
        count = pair.get('count')
        if char_id is not None and count is not None:
            id_to_count[char_id] = count

    if id_to_count:
        print("ID Frequency log10(freq)")
        # NOTE(review): range() requires the ids to be integers - confirm
        # against the JSON producer.
        for sample_id in range(0, max(id_to_count) + 1, 500):
            if sample_id in id_to_count:
                freq = id_to_count[sample_id]
                log_freq = math.log10(freq) if freq > 0 else 0
                bar = _log_bar(freq, max_freq, math.log10)
                print(f"{sample_id:6} {freq:10} {log_freq:6.2f} {bar}")
    else:
        print("(no entries carry both an 'id' and a 'count')")

    # Zipf's law fit
    print("\n4. Zipf's Law Analysis:")
    print(" Rank * Frequency ≈ constant for Zipf's law")
    print(" Top 10 ranks:")
    top_counts = counts_sorted_desc[:10]
    products = [rank * freq for rank, freq in enumerate(top_counts, start=1)]
    for rank, (freq, product) in enumerate(zip(top_counts, products), start=1):
        print(f" Rank {rank}: {freq:12} rank*freq = {product:.3e}")

    # Check if product is roughly constant (population standard deviation).
    avg_product = sum(products) / len(products)
    std_product = math.sqrt(sum((p - avg_product) ** 2 for p in products) / len(products))
    print(f" Average product (ranks 1-{len(products)}): {avg_product:.3e} ± {std_product:.3e}")
    if avg_product != 0:
        print(f" Coefficient of variation: {std_product/avg_product*100:.1f}%")
    else:
        print(" Coefficient of variation: n/a (average product is 0)")

    # Frequency spectrum
    freq_counter = Counter(counts)
    print("\n5. Frequency Spectrum (how many entries have each frequency):")
    print(" Frequency Count Cumulative")
    cum = 0
    for freq in sorted(freq_counter)[:20]:
        count = freq_counter[freq]
        cum += count
        print(f" {freq:10} {count:6} {cum:6}")

    # Summary statistics
    print("\n6. Key Statistics:")
    n = len(counts)
    lowest, highest = counts_sorted_desc[-1], counts_sorted_desc[0]
    print(f" Total entries: {n}")
    print(f" Min frequency: {lowest}")
    print(f" Max frequency: {highest}")
    if lowest != 0:
        print(f" Ratio max/min: {highest/lowest:.2e}")
    else:
        print(" Ratio max/min: n/a (min frequency is 0)")

    # NOTE: values are read from the DESCENDING ranking, so the line labelled
    # "p-th percentile" shows the count found at rank p*n from the top.
    percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
    for p in percentiles:
        idx = int(p * n)
        value = counts_sorted_desc[idx]
        print(f" {p*100:5.1f}th percentile: {value:12} (rank ~{idx})")

    # Save data for external plotting
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write("id,frequency\n")
        f.writelines(
            f"{char_id},{id_to_count[char_id]}\n" for char_id in sorted(id_to_count)
        )
    print(f"\nData saved to {csv_path} for external plotting")
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|