English: Verification of Heaps' law on War and Peace.
```python
import random
import urllib.request
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
# Plain-text corpus to analyse ("War and Peace", hosted on gutenberg.org).
CORPUS_URL = "http://www.gutenberg.org/files/2600/2600-0.txt"

# Number of leading tokens dropped before the analysis.
# NOTE(review): presumably this skips the Project Gutenberg front matter
# that precedes the novel itself -- confirm against the downloaded file.
LEADING_TOKENS_TO_SKIP = 940


def download_corpus(url=CORPUS_URL):
    """Download *url* and return its body decoded as UTF-8 text."""
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf8')


def tokenize(text):
    """Return the lower-cased word tokens of *text*, minus the leading ones.

    Tokens are maximal runs of word characters; the first
    ``LEADING_TOKENS_TO_SKIP`` tokens are discarded.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(text.lower())[LEADING_TOKENS_TO_SKIP:]


def count_vocabulary_growth(tokens):
    """Count distinct tokens seen so far at every position of *tokens*.

    Returns:
        A ``(total_words, unique_words)`` pair of arrays with one entry per
        token: ``total_words[i]`` is ``i + 1`` and ``unique_words[i]`` is the
        number of distinct tokens among ``tokens[:i + 1]``.
    """
    total_words = np.arange(1, len(tokens) + 1)
    unique_words = np.zeros(len(tokens))
    seen = set()
    for i, token in enumerate(tokens):
        seen.add(token)
        unique_words[i] = len(seen)
    return total_words, unique_words


def fit_heaps_law(total_words, unique_words):
    """Fit Heaps' law ``unique_words = K * total_words ** beta``.

    The fit is a degree-1 least-squares fit on the logarithms of both
    series, so the slope is ``beta`` and the intercept is ``log(K)``.

    Returns:
        The pair ``(K, beta)``.

    Raises:
        ValueError: If fewer than two data points are supplied, since a
            straight line cannot be fitted to them.
    """
    if len(total_words) < 2:
        raise ValueError("at least two data points are required to fit Heaps' law")
    beta, log_k = np.polyfit(np.log(total_words), np.log(unique_words), 1)
    return np.exp(log_k), beta


def analyze_and_plot(tokens, data_label, fit_label):
    """Fit Heaps' law to *tokens*, print the parameters and plot both curves.

    The empirical curve and the fitted curve are drawn on the current
    matplotlib figure. *data_label* is the legend entry of the empirical
    curve; *fit_label* is the prefix of the fitted curve's legend entry,
    which is completed with the estimated ``K`` and ``beta``.

    Returns:
        The fitted pair ``(K, beta)``.
    """
    total_words, unique_words = count_vocabulary_growth(tokens)
    k, beta = fit_heaps_law(total_words, unique_words)

    # Print the estimated parameters
    print('K:', k)
    print('beta:', beta)

    # Plot total words vs. unique words
    plt.plot(total_words, unique_words, label=data_label)
    plt.plot(total_words, k * total_words ** beta, '--',
             label=f"{fit_label}: K={k:.2f}, beta={beta:.2f}")
    return k, beta


def main():
    """Download the corpus, fit Heaps' law to it and save the comparison plot."""
    tokens = tokenize(download_corpus())

    plt.figure(figsize=(8, 6))
    analyze_and_plot(tokens, 'Empirical Data', "Heaps' Law Fit")

    # Repeat the analysis on a randomly shuffled copy of the same tokens so
    # both curves can be compared on one figure.
    shuffled_tokens = list(tokens)
    random.shuffle(shuffled_tokens)
    analyze_and_plot(shuffled_tokens, 'Shuffled Empirical Data',
                     "Heaps' Law Fit for shuffled data")

    plt.xlabel('Total Words')
    plt.ylabel('Unique Words')
    plt.legend()
    plt.grid(True)
    plt.title('Verification of Heaps\' Law on "War and Peace"')
    plt.savefig("war and peace.svg", bbox_inches='tight', format='svg')


if __name__ == "__main__":
    main()
```