#!/usr/bin/env python3
"""Visualize a frequency distribution with ASCII plots.

Loads ``pinyin_char_statistics.json``, prints a histogram, a rank-frequency
plot, an ID-vs-frequency plot, a Zipf's-law check, the frequency spectrum and
summary statistics, then writes ``id_vs_freq.csv`` to the current directory.
"""

import json
import math
import sys
from collections import Counter
from pathlib import Path

# Width, in characters, of the bars in the rank and ID plots.
BAR_WIDTH = 40
# Switch to logarithmic bins once max/min exceeds this ratio.
LOG_BIN_RATIO = 1000


def ascii_histogram(data, bins=20, width=60):
    """Return an ASCII histogram of ``data``, one line per non-empty bin.

    Logarithmic bins are used when the data is non-negative and spans more
    than ``LOG_BIN_RATIO`` between its smallest positive value and its
    maximum; otherwise the bins are linear.

    Args:
        data: Sequence of numbers (it is iterated more than once).
        bins: Number of bins; must be at least 1.
        width: Bar length, in characters, of the fullest bin.

    Returns:
        The histogram as a newline-joined string, or ``""`` for empty data.

    Raises:
        ValueError: If ``bins`` is smaller than 1.
    """
    if not data:
        return ""
    if bins < 1:
        raise ValueError("bins must be a positive integer")

    min_val = min(data)
    max_val = max(data)

    # Compare against the smallest *positive* value so that zeros in the data
    # cannot cause a division by zero; for strictly positive data this is
    # exactly the max/min ratio.
    smallest_positive = min((v for v in data if v > 0), default=None)
    use_log_bins = (
        min_val >= 0
        and smallest_positive is not None
        and max_val / smallest_positive > LOG_BIN_RATIO
    )

    if use_log_bins:
        log_min = math.log10(smallest_positive)
        log_max = math.log10(max_val)
        log_span = log_max - log_min
        edges = [10 ** (log_min + i * log_span / bins) for i in range(bins + 1)]
        hist = [0] * bins
        for val in data:
            if val <= 0:
                # Zeros have no logarithm and are left out of the log plot.
                continue
            idx = min(int((math.log10(val) - log_min) / log_span * bins), bins - 1)
            hist[idx] += 1
        labels = [f"{lo:.1e}-{hi:.1e}" for lo, hi in zip(edges, edges[1:])]
    else:
        span = max_val - min_val
        if span == 0:
            # Every value is identical: a single bin holds all of them.
            labels = [f"{min_val:.1f}-{max_val:.1f}"]
            hist = [len(data)]
        else:
            bin_width = span / bins
            edges = [min_val + i * bin_width for i in range(bins + 1)]
            hist = [0] * bins
            for val in data:
                # The maximum maps to index ``bins``; clamp it into the last bin.
                idx = min(int((val - min_val) / span * bins), bins - 1)
                hist[idx] += 1
            labels = [f"{lo:.1f}-{hi:.1f}" for lo, hi in zip(edges, edges[1:])]

    max_count = max(hist)
    return "\n".join(
        f"{label:20} | {'#' * int(count / max_count * width)} {count}"
        for label, count in zip(labels, hist)
        if count
    )


def _log_ratio(value, reference, log=math.log10):
    """Return log(value) / log(reference), guarding the undefined cases."""
    if value <= 0:
        return 0.0
    if reference <= 1:
        # log(reference) is zero or negative, so the scale is degenerate.
        return 1.0 if value >= reference else 0.0
    return log(value) / log(reference)


def _print_rank_frequency(counts_sorted_desc, top_n=50):
    """Print a log-scaled bar for each of the ``top_n`` highest counts."""
    print(f"\n2. Rank-Frequency Relationship (Top {top_n}):")
    max_freq = counts_sorted_desc[0]
    # Slicing keeps this safe when there are fewer than ``top_n`` entries.
    for rank, freq in enumerate(counts_sorted_desc[:top_n], start=1):
        bar = "#" * int(_log_ratio(freq, max_freq, math.log) * BAR_WIDTH)
        print(f"Rank {rank:3}: {freq:12} {bar}")


def _print_id_vs_frequency(id_to_count, max_freq, step=500):
    """Print the count of every ``step``-th ID that is present in the mapping."""
    print(f"\n3. ID vs Frequency (sampled every {step} IDs):")
    print("ID Frequency log10(freq)")
    if not id_to_count:
        return
    for char_id in range(0, max(id_to_count) + 1, step):
        freq = id_to_count.get(char_id)
        if freq is None:
            continue
        log_freq = math.log10(freq) if freq > 0 else 0
        bar = "#" * int(_log_ratio(freq, max_freq) * BAR_WIDTH)
        print(f"{char_id:6} {freq:10} {log_freq:6.2f} {bar}")


def _print_zipf(counts_sorted_desc, top_n=10):
    """Print rank * frequency for the top ranks and how constant it is."""
    print("\n4. Zipf's Law Analysis:")
    print(" Rank * Frequency ≈ constant for Zipf's law")
    print(f" Top {top_n} ranks:")
    top = counts_sorted_desc[:top_n]
    products = [rank * freq for rank, freq in enumerate(top, start=1)]
    if not products:
        return
    for rank, (freq, product) in enumerate(zip(top, products), start=1):
        print(f" Rank {rank}: {freq:12} rank*freq = {product:.3e}")

    avg_product = sum(products) / len(products)
    std_product = math.sqrt(
        sum((p - avg_product) ** 2 for p in products) / len(products)
    )
    # The products cover ranks 1..len(products); label them accordingly.
    print(
        f" Average product (ranks 1-{len(products)}): "
        f"{avg_product:.3e} ± {std_product:.3e}"
    )
    if avg_product:
        print(f" Coefficient of variation: {std_product / avg_product * 100:.1f}%")
    else:
        print(" Coefficient of variation: n/a")


def _print_spectrum(counts, limit=20):
    """Print how many entries share each of the ``limit`` lowest frequencies."""
    freq_counter = Counter(counts)
    print("\n5. Frequency Spectrum (how many entries have each frequency):")
    print(" Frequency Count Cumulative")
    cumulative = 0
    for freq in sorted(freq_counter)[:limit]:
        count = freq_counter[freq]
        cumulative += count
        print(f" {freq:10} {count:6} {cumulative:6}")


def _print_statistics(counts_sorted_desc):
    """Print entry count, extremes, max/min ratio and rank-based cut-offs."""
    print("\n6. Key Statistics:")
    n = len(counts_sorted_desc)
    highest = counts_sorted_desc[0]
    lowest = counts_sorted_desc[-1]
    print(f" Total entries: {n}")
    print(f" Min frequency: {lowest}")
    print(f" Max frequency: {highest}")
    if lowest != 0:
        print(f" Ratio max/min: {highest / lowest:.2e}")
    else:
        print(" Ratio max/min: undefined (min frequency is 0)")

    # NOTE: these index the *descending* ranking, so a small fraction selects
    # a high-frequency entry (p = 0.01 is the entry 1% of the way down).
    for p in (0.01, 0.1, 0.5, 0.9, 0.99):
        idx = int(p * n)
        value = counts_sorted_desc[idx]
        print(f" {p * 100:5.1f}th percentile: {value:12} (rank ~{idx})")


def _save_csv(id_to_count, path="id_vs_freq.csv"):
    """Write the ID -> frequency mapping, sorted by ID, as a two-column CSV."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("id,frequency\n")
        f.writelines(
            f"{char_id},{id_to_count[char_id]}\n" for char_id in sorted(id_to_count)
        )
    print(f"\nData saved to {path} for external plotting")


def main():
    """Load the statistics JSON, print every report section, and save the CSV."""
    json_path = (
        Path(__file__).parent.parent
        / "src"
        / "model"
        / "assets"
        / "pinyin_char_statistics.json"
    )
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit(f"Statistics file not found: {json_path}")

    pairs = data.get("pairs", {})
    counts = [
        pair["count"] for pair in pairs.values() if pair.get("count") is not None
    ]
    if not counts:
        # Every section below needs at least one count.
        sys.exit(f"No frequency counts found in {json_path}")

    print("FREQUENCY DISTRIBUTION ANALYSIS")
    print("=" * 60)
    print("\n1. ASCII Histogram (log bins):")
    print(ascii_histogram(counts, bins=20, width=60))

    counts_sorted_desc = sorted(counts, reverse=True)
    max_freq = counts_sorted_desc[0]
    _print_rank_frequency(counts_sorted_desc)

    # Only pairs that carry both an ID and a count take part in the ID plot.
    id_to_count = {
        pair["id"]: pair["count"]
        for pair in pairs.values()
        if pair.get("id") is not None and pair.get("count") is not None
    }
    _print_id_vs_frequency(id_to_count, max_freq)

    _print_zipf(counts_sorted_desc)
    _print_spectrum(counts)
    _print_statistics(counts_sorted_desc)
    _save_csv(id_to_count)


if __name__ == "__main__":
    main()