Skip to content
Snippets Groups Projects
Commit 87468754 authored by Steffen Remus's avatar Steffen Remus
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
Showing with 361 additions and 0 deletions
__pycache__
The ``zipf`` package tallies the occurrences of words in text files
and plots each word's rank versus its frequency
together with a line for the theoretical distribution for Zipf's Law.
Motivation
----------
Zipf’s Law is often stated as an observational pattern seen in the
relationship between the frequency and rank of words in a text:
`"…the most frequent word will occur approximately twice as often
as the second most frequent word,
three times as often as the third most
frequent word, etc."`
`wikipedia <https://en.wikipedia.org/wiki/Zipf%27s_law>`_
Many books are available to download in plain text format
from sites such as `Project Gutenberg <https://www.gutenberg.org/>`_,
so we created this package to qualitatively explore how well different books align
with the word frequencies predicted by Zipf's Law.
Installation
------------
``pip install zipf``
Usage
-----
After installing this package,
the following three commands will be available from the command line:
- ``countwords`` for counting the occurrences of words in a text.
- ``collate`` for collating multiple word count files together.
- ``plotcounts`` for visualizing the word counts.
A typical usage scenario would include running the following from your terminal::
countwords dracula.txt > dracula.csv
countwords moby_dick.txt > moby_dick.csv
collate dracula.csv moby_dick.csv > collated.csv
plotcounts collated.csv --outfile zipf-drac-moby.jpg
Additional information on each command
can be found in its docstring and by appending the ``-h`` flag,
e.g. ``countwords -h``.
Contributors
------------
- Amira Khan <amira@zipf.org>
- Sami Virtanen <sami@zipf.org>
# Get desired information from a Project Gutenberg eBook.
# Usage: bash book_summary.sh /path/to/file.txt what_to_look_for
#
# The relevant metadata lines sit near the top of the file, so slice
# lines 10-17 out with head/tail, then grep for the requested field.
# Quote the positional parameters so file names containing spaces and
# search terms containing shell metacharacters survive word splitting.
head -n 17 "$1" | tail -n 8 | grep "$2"
"""
Combine multiple word count CSV-files
into a single cumulative count.
"""
import csv
import argparse
from collections import Counter
import logging
import utilities as util
def update_counts(reader, word_counts):
    """Merge the (word, count) CSV rows from *reader* into *word_counts*."""
    rows = csv.reader(reader)
    for row in rows:
        word, count = row
        word_counts[word] += int(count)
def process_file(fname, word_counts):
    """Read one word-count CSV file and merge its counts into word_counts.

    Parameters
    ----------
    fname : str
        Path of a CSV file of (word, count) rows; must end in '.csv'.
    word_counts : collections.Counter
        Accumulator, updated in place.

    Raises
    ------
    OSError
        If fname does not have a '.csv' suffix.
    """
    logging.debug(f'Reading in {fname}...')
    # str.endswith is the idiomatic (and short-name-safe) form of the
    # original slice check fname[-4:] != '.csv'.
    if not fname.endswith('.csv'):
        msg = util.ERRORS['not_csv_suffix'].format(
            fname=fname)
        raise OSError(msg)
    with open(fname, 'r') as reader:
        logging.debug('Computing word counts...')
        update_counts(reader, word_counts)
def main(args):
    """Run the command line program: collate word counts from many CSV files."""
    # --verbose enables DEBUG-level messages; otherwise only warnings appear.
    log_lev = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_lev, filename=args.logfile)
    word_counts = Counter()
    logging.info('Processing files...')
    for fname in args.infiles:
        try:
            process_file(fname, word_counts)
        except FileNotFoundError:
            msg = f'{fname} not processed: File does not exist'
            logging.warning(msg)
        except PermissionError:
            msg = f'{fname} not processed: No read permission'
            logging.warning(msg)
        except Exception as error:
            # Deliberate catch-all: one bad file is logged and skipped so
            # the remaining files can still be collated.
            msg = f'{fname} not processed: {error}'
            logging.warning(msg)
    # Emit the merged counts as CSV on stdout, optionally limited to the
    # top --num most frequent words.
    util.collection_to_csv(word_counts, num=args.num)
if __name__ == '__main__':
    # Command-line entry point: collate one or more word-count CSV files.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infiles', type=str, nargs='*',
                        help='Input file names')
    parser.add_argument('-n', '--num',
                        type=int, default=None,
                        help='Output n most frequent words')
    parser.add_argument('-v', '--verbose',
                        action="store_true", default=False,
                        help="Set logging level to DEBUG")
    parser.add_argument('-l', '--logfile',
                        type=str, default='collate.log',
                        help='Name of the log file')
    args = parser.parse_args()
    main(args)
"""
Count the occurrences of all words in a text
and output them in CSV format.
"""
import argparse
import string
from collections import Counter
import utilities as util
def count_words(reader):
    """Tally how often each word occurs in the text read from *reader*."""
    raw_tokens = reader.read().split()
    words = []
    for token in raw_tokens:
        # Trim surrounding punctuation, drop empty leftovers, lowercase.
        stripped = token.strip(string.punctuation)
        if stripped:
            words.append(stripped.lower())
    return Counter(words)
def main(args):
    """Run the command line program."""
    # Count the words in the input stream, then emit them as CSV on stdout.
    counts = count_words(args.infile)
    util.collection_to_csv(counts, num=args.num)
if __name__ == '__main__':
    # Command-line entry point: count words in a file (or stdin via '-').
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=argparse.FileType('r'),
                        nargs='?', default='-',
                        help='Input file name')
    parser.add_argument('-n', '--num',
                        type=int, default=None,
                        help='Output n most frequent words')
    args = parser.parse_args()
    main(args)
"""Plot word counts."""
import argparse
import yaml
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.optimize import minimize_scalar
def nlog_likelihood(beta, counts):
    """Negative log-likelihood of exponent *beta* given the word counts."""
    exponent = beta - 1
    per_count = (1 / counts) ** exponent - (1 / (counts + 1)) ** exponent
    return -np.sum(np.log(per_count))
def get_power_law_params(word_counts):
    """
    Get the power law parameters.

    Parameters
    ----------
    word_counts : numpy.ndarray
        Array of word occurrence counts.

    Returns
    -------
    float
        The estimated alpha parameter.

    Raises
    ------
    TypeError
        If word_counts is not a numpy array.

    References
    ----------
    Moreno-Sanchez et al (2016) define alpha (Eq. 1),
    beta (Eq. 2) and the maximum likelihood estimation (mle)
    of beta (Eq. 6).
    Moreno-Sanchez I, Font-Clos F, Corral A (2016)
    Large-Scale Analysis of Zipf's Law in English Texts.
    PLoS ONE 11(1): e0147073.
    https://doi.org/10.1371/journal.pone.0147073
    """
    # Raise rather than assert: assertions are stripped when Python runs
    # with -O, so they must not guard input validation.
    if not isinstance(word_counts, np.ndarray):
        raise TypeError(
            'Input must be a numerical (numpy) array of word counts')
    # Bracket starts just above 1 because beta == 1 would make
    # alpha = 1 / (beta - 1) divide by zero.
    mle = minimize_scalar(nlog_likelihood,
                          bracket=(1 + 1e-10, 4),
                          args=(word_counts,),
                          method='brent')
    beta = mle.x
    alpha = 1 / (beta - 1)
    return alpha
def set_plot_params(param_file):
    """Apply matplotlib rc parameters loaded from an optional YAML file."""
    overrides = {}
    if param_file:
        with open(param_file, 'r') as reader:
            overrides = yaml.load(reader,
                                  Loader=yaml.BaseLoader)
    for key, value in overrides.items():
        mpl.rcParams[key] = value
def plot_fit(curve_xmin, curve_xmax, max_rank, alpha, ax):
    """
    Draw the fitted power law curve on top of the data.

    Parameters
    ----------
    curve_xmin : float
        Minimum x-bound for fitted curve
    curve_xmax : float
        Maximum x-bound for fitted curve
    max_rank : int
        Maximum word frequency rank.
    alpha : float
        Estimated alpha parameter for the power law.
    ax : matplotlib axes
        Scatter plot to which the power curve will be added.
    """
    frequencies = np.arange(curve_xmin, curve_xmax)
    predicted_ranks = max_rank * frequencies ** (-1 / alpha)
    ax.loglog(frequencies, predicted_ranks, color='grey')
def save_configuration(fname, params):
    """Dump the plotting configuration *params* to *fname* as YAML."""
    # Open for writing; the original misleadingly named this handle 'reader'.
    with open(fname, 'w') as config_file:
        yaml.dump(params, config_file)
def main(args):
    """Run the command line program: plot word rank vs. frequency."""
    # Apply an optional matplotlib style sheet and/or YAML parameter file.
    if args.style:
        plt.style.use(args.style)
    set_plot_params(args.plotparams)
    if args.saveconfig:
        save_configuration(args.saveconfig, mpl.rcParams)
    # The input CSV has no header row: column 0 is the word, column 1 its count.
    df = pd.read_csv(args.infile, header=None,
                     names=('word', 'word_frequency'))
    # Higher frequency -> lower rank number; ties share the largest rank
    # in their group (method='max').
    df['rank'] = df['word_frequency'].rank(ascending=False,
                                           method='max')
    ax = df.plot.scatter(x='word_frequency',
                         y='rank', loglog=True,
                         figsize=[12, 6],
                         grid=True,
                         xlim=args.xlim)
    word_counts = df['word_frequency'].to_numpy()
    alpha = get_power_law_params(word_counts)
    print('alpha:', alpha)
    # Since the ranks are already sorted, we can take the last
    # one instead of computing which row has the highest rank
    max_rank = df['rank'].to_numpy()[-1]
    # Use the range of the data as the boundaries
    # when drawing the power law curve
    curve_xmin = df['word_frequency'].min()
    curve_xmax = df['word_frequency'].max()
    plot_fit(curve_xmin, curve_xmax, max_rank, alpha, ax)
    ax.figure.savefig(args.outfile)
if __name__ == '__main__':
    # Command-line entry point: plot counts from a CSV file (or stdin).
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=argparse.FileType('r'),
                        nargs='?', default='-',
                        help='Word count csv file name')
    parser.add_argument('--outfile', type=str,
                        default='plotcounts.png',
                        help='Output image file name')
    parser.add_argument('--xlim', type=float, nargs=2,
                        metavar=('XMIN', 'XMAX'),
                        default=None, help='X-axis limits')
    parser.add_argument('--plotparams', type=str, default=None,
                        help='matplotlib parameters (YAML file)')
    parser.add_argument('--style', type=str, nargs='*',
                        choices=plt.style.available,
                        default=None, help='matplotlib style')
    parser.add_argument('--saveconfig', type=str, default=None,
                        help='Save configuration to file')
    args = parser.parse_args()
    main(args)
"""One-line description of what the script does."""
import argparse
def main(args):
    """Run the program."""
    # Demonstrate that argument parsing worked by echoing the arguments.
    for label, value in (('Input file:', args.infile),
                         ('Output file:', args.outfile)):
        print(label, value)
if __name__ == '__main__':
    # Command-line entry point for the script template.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=str,
                        help='Input file name')
    parser.add_argument('outfile', type=str,
                        help='Output file name')
    args = parser.parse_args()
    main(args)
"""Collection of commonly used functions."""
import sys
import csv
# Error-message templates shared by the command-line scripts; each value
# is a format string filled in with .format(fname=...) by the caller.
ERRORS = {
    'not_csv_suffix': '{fname}: File must end in .csv',
}
def collection_to_csv(collection, num=None):
    """
    Write a collection of items and counts to stdout in CSV format.

    Parameters
    ----------
    collection : collections.Counter
        Collection of items and counts
    num : int
        Limit output to N most frequent items
    """
    # most_common() returns all entries sorted by descending count.
    ranked = collection.most_common()
    selected = ranked if num is None else ranked[:num]
    csv.writer(sys.stdout).writerows(selected)
Full text of novels downloaded from Project Gutenberg.
- What is this?: text files (newline-terminated)
- Source(s): Project Gutenberg, https://www.gutenberg.org/
- License: public domain
- Each file contains the text of its license.
- See https://www.gutenberg.org/wiki/Gutenberg:Permission_How-To for details.
- Downloaded: 2018-08-15
- Contact: Greg Wilson <gvwilson@third-bit.com>
- Spatial Applicability: N/A
- Temporal Applicability: N/A
- Notes:
- Files contain some 8-bit characters (e.g., curly quotation marks).
- Files contain bibliographic information at the start.
- Files contain errata, license information, and other material at the end.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
results/collated.png

32.5 KiB

This diff is collapsed.
results/dracula.png

30.6 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment