Appendix B — Appendix B: Python Toolkit
B.1 What This Appendix Is
This appendix collects a compact set of reusable Python utilities for the main quantities in the book: surprise, entropy, conditional entropy, KL divergence, mutual information, perplexity, Kraft sums, and Huffman coding.
The code is intentionally lightweight. It uses only the Python standard library so you can paste it into a notebook, script, or utility module without additional dependencies.
The toolkit is not optimized for production use. It is designed to be:
- easy to read
- mathematically faithful
- reusable across exercises and experiments
If you want to turn it into a local module, the simplest approach is to copy the code block below into a file such as info_toolkit.py.
B.2 Core Toolkit
import math
import heapq
from collections import Counter
def normalize(dist: dict) -> dict:
"""
Normalize a dictionary of nonnegative weights into probabilities.
"""
total = sum(dist.values())
if total <= 0:
raise ValueError("distribution must have positive total mass")
return {k: v / total for k, v in dist.items()}
def surprise(p: float, base: float = 2.0) -> float:
"""
Information content of an event with probability p.
"""
if p <= 0 or p > 1:
raise ValueError("p must be in (0, 1]")
return -math.log(p, base)
def entropy_from_probs(probs, base: float = 2.0) -> float:
"""
Shannon entropy from an iterable of probabilities.
"""
return -sum(p * math.log(p, base) for p in probs if p > 0)
def entropy(data, base: float = 2.0) -> float:
"""
Shannon entropy from either:
- a dict mapping outcomes -> weights/probabilities
- an iterable of samples
"""
if isinstance(data, dict):
probs = normalize(data).values()
else:
counts = Counter(data)
probs = normalize(counts).values()
return entropy_from_probs(probs, base=base)
def joint_entropy(pairs, base: float = 2.0) -> float:
"""
Joint entropy H(X, Y) from an iterable of (x, y) samples.
"""
return entropy(list(pairs), base=base)
def conditional_entropy_from_joint(joint_dist: dict,
given_axis: int = 0,
base: float = 2.0) -> float:
"""
Conditional entropy from a joint distribution dict.
joint_dist maps tuples (x, y) -> mass.
given_axis = 0 returns H(Y|X)
given_axis = 1 returns H(X|Y)
"""
joint = normalize(joint_dist)
marginal = Counter()
for (x, y), p_xy in joint.items():
key = x if given_axis == 0 else y
marginal[key] += p_xy
h = 0.0
for (x, y), p_xy in joint.items():
given = x if given_axis == 0 else y
p_given = marginal[given]
if p_xy > 0 and p_given > 0:
h -= p_xy * math.log(p_xy / p_given, base)
return h
def cross_entropy(p: dict, q: dict, base: float = 2.0,
eps: float = 1e-15) -> float:
"""
Cross-entropy H(P, Q) for discrete distributions.
"""
p = normalize(p)
q = normalize(q)
total = 0.0
for x, p_x in p.items():
q_x = max(q.get(x, 0.0), eps)
total -= p_x * math.log(q_x, base)
return total
def kl_divergence(p: dict, q: dict, base: float = 2.0,
eps: float = 1e-15) -> float:
"""
KL(P || Q) for discrete distributions.
"""
p = normalize(p)
q = normalize(q)
total = 0.0
for x, p_x in p.items():
if p_x <= 0:
continue
q_x = q.get(x, 0.0)
if q_x <= 0:
q_x = eps
total += p_x * math.log(p_x / q_x, base)
return total
def mutual_information_from_joint(joint_dist: dict,
base: float = 2.0) -> float:
"""
Mutual information I(X;Y) from a joint distribution.
joint_dist maps (x, y) -> mass.
"""
joint = normalize(joint_dist)
p_x = Counter()
p_y = Counter()
for (x, y), p_xy in joint.items():
p_x[x] += p_xy
p_y[y] += p_xy
mi = 0.0
for (x, y), p_xy in joint.items():
if p_xy <= 0:
continue
mi += p_xy * math.log(p_xy / (p_x[x] * p_y[y]), base)
return mi
def mutual_information_from_samples(xs, ys, base: float = 2.0) -> float:
"""
Mutual information from paired samples.
"""
if len(xs) != len(ys):
raise ValueError("xs and ys must have equal length")
joint = Counter(zip(xs, ys))
return mutual_information_from_joint(joint, base=base)
def binary_entropy(p: float, base: float = 2.0) -> float:
"""
H_b(p) for a Bernoulli(p) random variable.
"""
if p < 0 or p > 1:
raise ValueError("p must be in [0, 1]")
if p == 0 or p == 1:
return 0.0
return -(p * math.log(p, base) + (1 - p) * math.log(1 - p, base))
def perplexity_from_cross_entropy(loss: float, unit: str = "bits") -> float:
"""
Convert cross-entropy loss to perplexity.
unit = 'bits' or 'nats'
"""
if unit == "bits":
return 2 ** loss
if unit == "nats":
return math.exp(loss)
raise ValueError("unit must be 'bits' or 'nats'")
def cross_entropy_from_perplexity(perplexity: float,
unit: str = "bits") -> float:
"""
Convert perplexity back to cross-entropy.
"""
if perplexity <= 0:
raise ValueError("perplexity must be positive")
if unit == "bits":
return math.log2(perplexity)
if unit == "nats":
return math.log(perplexity)
raise ValueError("unit must be 'bits' or 'nats'")
def kraft_sum(code_lengths, alphabet_size: int = 2) -> float:
"""
Compute the Kraft sum ∑ D^{-l_i}.
"""
return sum(alphabet_size ** (-length) for length in code_lengths)
class _HuffmanNode:
def __init__(self, weight, symbol=None, left=None, right=None):
self.weight = weight
self.symbol = symbol
self.left = left
self.right = right
def __lt__(self, other):
return self.weight < other.weight
def build_huffman_codes(freqs: dict) -> dict:
"""
Build a binary Huffman code from a symbol -> weight mapping.
Returns symbol -> bitstring.
"""
freqs = {k: v for k, v in freqs.items() if v > 0}
if not freqs:
return {}
if len(freqs) == 1:
only = next(iter(freqs))
return {only: "0"}
heap = [_HuffmanNode(weight=v, symbol=k) for k, v in freqs.items()]
heapq.heapify(heap)
while len(heap) > 1:
a = heapq.heappop(heap)
b = heapq.heappop(heap)
parent = _HuffmanNode(weight=a.weight + b.weight, left=a, right=b)
heapq.heappush(heap, parent)
root = heap[0]
codes = {}
def walk(node, prefix):
if node.symbol is not None:
codes[node.symbol] = prefix or "0"
return
walk(node.left, prefix + "0")
walk(node.right, prefix + "1")
walk(root, "")
return codes
def expected_code_length(freqs: dict, codes: dict) -> float:
"""
Expected code length under a given codebook.
"""
probs = normalize(freqs)
return sum(probs[sym] * len(code) for sym, code in codes.items())
def encode_symbols(symbols, codes: dict) -> str:
"""
Encode a sequence of symbols to a bitstring.
"""
return "".join(codes[s] for s in symbols)
def decode_huffman(bitstring: str, codes: dict):
"""
Decode a Huffman-coded bitstring.
"""
reverse = {code: sym for sym, code in codes.items()}
out = []
buf = ""
for bit in bitstring:
buf += bit
if buf in reverse:
out.append(reverse[buf])
buf = ""
if buf:
raise ValueError("incomplete codeword at end of bitstring")
return out
def average_surprise(samples, base: float = 2.0) -> float:
"""
Empirical average surprise of observed samples.
Equivalent to empirical entropy under the plug-in estimate.
"""
counts = Counter(samples)
probs = normalize(counts)
return sum(-math.log(probs[x], base) for x in samples) / len(samples)B.3 Minimal Usage Examples
from collections import Counter
# Entropy of a Bernoulli source
print(binary_entropy(0.5)) # 1.0 bit
print(binary_entropy(0.9)) # about 0.469 bits
# KL divergence between two categorical distributions
p = {'A': 0.7, 'B': 0.2, 'C': 0.1}
q = {'A': 0.5, 'B': 0.3, 'C': 0.2}
print(kl_divergence(p, q)) # extra bits per symbol
# Mutual information from paired samples
xs = [0, 0, 0, 1, 1, 1]
ys = [0, 0, 1, 1, 1, 0]
print(mutual_information_from_samples(xs, ys))
# Huffman coding
freqs = Counter("mississippi")
codes = build_huffman_codes(freqs)
encoded = encode_symbols("mississippi", codes)
decoded = "".join(decode_huffman(encoded, codes))
print(codes)
print(expected_code_length(freqs, codes))
print(decoded)B.4 Practical Notes
B.4.1 1. Be explicit about units
Entropy in bits and entropy in nats differ only by a constant factor, but mixing them silently causes confusion. If a library uses natural logarithms, label the result as nats.
B.4.2 2. Empirical estimates are estimates
Functions that compute entropy or mutual information from samples use empirical frequencies. They are consistent, but finite-sample bias is real, especially when the support is large relative to the sample size.
B.4.3 3. KL divergence is unforgiving about zero probability
If Q(x) = 0 while P(x) > 0, the true KL divergence is infinite. In practical code we often smooth with a tiny eps, but that is a numerical approximation, not a mathematical identity.
B.4.4 4. Huffman codes are optimal only within their model class
Huffman coding is optimal among prefix-free symbol-by-symbol codes for a known distribution. It is not the last word in compression. Arithmetic coding, context models, and dictionary methods can beat it on real data because they exploit more structure.
B.5 Suggested Extensions
If you want to extend this toolkit, useful next additions are:
- arithmetic coding intervals
- Blahut-Arimoto for channel capacity
- k-NN mutual information estimators for continuous variables
- CRC implementations
- prequential coding utilities
- PSI and drift monitors for production systems
At that point you are no longer just reading information theory. You are building with it.