Skip to content
Snippets Groups Projects
Commit 3c5b5eb1 authored by Boehm, Valentin's avatar Boehm, Valentin
Browse files

Add scripts, novels, word counts, and plots

parents
No related branches found
No related tags found
No related merge requests found
Showing with 310 additions and 0 deletions
.DS_Store
\ No newline at end of file
# Get desired information from a Project Gutenberg eBook.
# Usage: bash book_summary.sh /path/to/file.txt what_to_look_for
# Quote both expansions: an unquoted $1 breaks on paths containing
# spaces, and an unquoted $2 would be glob-expanded by the shell.
head -n 17 "$1" | tail -n 8 | grep "$2"
"""
Combine multiple word count CSV-files
into a single cumulative count.
"""
import csv
import argparse
from collections import Counter
import logging
import utilities as util
def update_counts(reader, word_counts):
    """Fold the (word, count) rows of an open CSV stream into word_counts."""
    for row in csv.reader(reader):
        token, tally = row
        word_counts[token] += int(tally)
def process_file(fname, word_counts):
    """Read one word-count CSV file and fold its totals into word_counts.

    Parameters
    ----------
    fname : str
        Path to a ``.csv`` file of ``word,count`` rows.
    word_counts : collections.Counter
        Running totals, updated in place.

    Raises
    ------
    OSError
        If ``fname`` does not end in ``.csv``.
    """
    logging.debug(f'Reading in {fname}...')
    # Reject non-CSV inputs up front with a clear, formatted message.
    # str.endswith is the idiomatic (and slice-free) suffix test.
    if not fname.endswith('.csv'):
        msg = util.ERRORS['not_csv_suffix'].format(
            fname=fname)
        raise OSError(msg)
    with open(fname, 'r') as reader:
        logging.debug('Computing word counts...')
        update_counts(reader, word_counts)
def main(args):
    """Run the command line program."""
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
        filename=args.logfile)
    word_counts = Counter()
    logging.info('Processing files...')
    # Best-effort: a bad input file is logged and skipped, never fatal.
    for fname in args.infiles:
        try:
            process_file(fname, word_counts)
        except FileNotFoundError:
            logging.warning(f'{fname} not processed: File does not exist')
        except PermissionError:
            logging.warning(f'{fname} not processed: No read permission')
        except Exception as error:
            logging.warning(f'{fname} not processed: {error}')
    util.collection_to_csv(word_counts, num=args.num)
if __name__ == '__main__':
    # Command-line interface: input files plus output/logging options.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infiles', type=str, nargs='*',
                        help='Input file names')
    parser.add_argument('-n', '--num', type=int, default=None,
                        help='Output n most frequent words')
    parser.add_argument('-v', '--verbose', action="store_true",
                        default=False,
                        help="Set logging level to DEBUG")
    parser.add_argument('-l', '--logfile', type=str,
                        default='collate.log',
                        help='Name of the log file')
    main(parser.parse_args())
"""
Count the occurrences of all words in a text
and output them in CSV format.
"""
import argparse
import string
from collections import Counter
import utilities as util
def count_words(reader):
    """Count the occurrence of each word in a string."""
    # Split on whitespace, strip surrounding punctuation, drop empties,
    # and fold case before tallying.
    stripped = (chunk.strip(string.punctuation)
                for chunk in reader.read().split())
    return Counter(token.lower() for token in stripped if token)
def main(args):
    """Run the command line program."""
    # Count the words in the input stream and write CSV to stdout.
    util.collection_to_csv(count_words(args.infile), num=args.num)
if __name__ == '__main__':
    # Command-line interface: optional input file (default stdin) and -n.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=argparse.FileType('r'),
                        nargs='?', default='-',
                        help='Input file name')
    parser.add_argument('-n', '--num', type=int, default=None,
                        help='Output n most frequent words')
    main(parser.parse_args())
"""Plot word counts."""
import argparse
import yaml
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.optimize import minimize_scalar
def nlog_likelihood(beta, counts):
    """Negative log-likelihood of a discrete power law with exponent beta."""
    inv = 1.0 / counts
    inv_next = 1.0 / (counts + 1)
    terms = np.log(inv ** (beta - 1) - inv_next ** (beta - 1))
    return -np.sum(terms)
def get_power_law_params(word_counts):
    """
    Get the power law parameters.

    Parameters
    ----------
    word_counts : numpy.ndarray
        Word frequency counts.

    Returns
    -------
    float
        Estimated alpha parameter of the power law.

    Raises
    ------
    TypeError
        If ``word_counts`` is not a numpy array.

    References
    ----------
    Moreno-Sanchez et al (2016) define alpha (Eq. 1),
    beta (Eq. 2) and the maximum likelihood estimation (mle)
    of beta (Eq. 6).
    Moreno-Sanchez I, Font-Clos F, Corral A (2016)
    Large-Scale Analysis of Zipf's Law in English Texts.
    PLoS ONE 11(1): e0147073.
    https://doi.org/10.1371/journal.pone.0147073
    """
    # isinstance (not type ==) also accepts ndarray subclasses, and an
    # explicit raise survives ``python -O`` where asserts are stripped.
    if not isinstance(word_counts, np.ndarray):
        raise TypeError(
            'Input must be a numerical (numpy) array of word counts')
    mle = minimize_scalar(nlog_likelihood,
                          bracket=(1 + 1e-10, 4),
                          # args must be a tuple: scipy calls
                          # func(x, *args), so a bare array would be
                          # unpacked into one argument per element.
                          args=(word_counts,),
                          method='brent')
    beta = mle.x
    alpha = 1 / (beta - 1)
    return alpha
def set_plot_params(param_file):
    """Apply matplotlib rcParams read from a YAML file (no-op if none given)."""
    param_dict = {}
    if param_file:
        with open(param_file, 'r') as reader:
            param_dict = yaml.load(reader,
                                   Loader=yaml.BaseLoader)
    for param, value in param_dict.items():
        mpl.rcParams[param] = value
def plot_fit(curve_xmin, curve_xmax, max_rank, alpha, ax):
    """
    Overlay the fitted power-law curve on an existing scatter plot.

    Parameters
    ----------
    curve_xmin : float
        Minimum x-bound for fitted curve
    curve_xmax : float
        Maximum x-bound for fitted curve
    max_rank : int
        Maximum word frequency rank.
    alpha : float
        Estimated alpha parameter for the power law.
    ax : matplotlib axes
        Scatter plot to which the power curve will be added.
    """
    frequencies = np.arange(curve_xmin, curve_xmax)
    ranks = max_rank * frequencies ** (-1 / alpha)
    ax.loglog(frequencies, ranks, color='grey')
def save_configuration(fname, params):
    """Dump the given parameters to a YAML file."""
    # Open for writing; 'ostream' (not 'reader') reflects the direction.
    with open(fname, 'w') as ostream:
        yaml.dump(params, ostream)
def main(args):
    """Run the command line program."""
    if args.style:
        plt.style.use(args.style)
    set_plot_params(args.plotparams)
    if args.saveconfig:
        save_configuration(args.saveconfig, mpl.rcParams)
    counts = pd.read_csv(args.infile, header=None,
                         names=('word', 'word_frequency'))
    counts['rank'] = counts['word_frequency'].rank(ascending=False,
                                                   method='max')
    ax = counts.plot.scatter(x='word_frequency', y='rank',
                             loglog=True, figsize=[12, 6],
                             grid=True, xlim=args.xlim)
    frequencies = counts['word_frequency'].to_numpy()
    alpha = get_power_law_params(frequencies)
    print('alpha:', alpha)
    # Ranks are already sorted, so the last entry is the largest rank.
    max_rank = counts['rank'].to_numpy()[-1]
    # Bound the fitted curve by the observed frequency range.
    plot_fit(frequencies.min(), frequencies.max(), max_rank, alpha, ax)
    ax.figure.savefig(args.outfile)
if __name__ == '__main__':
    # Command-line interface: input CSV plus plotting/configuration options.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=argparse.FileType('r'),
                        nargs='?', default='-',
                        help='Word count csv file name')
    parser.add_argument('--outfile', type=str,
                        default='plotcounts.png',
                        help='Output image file name')
    parser.add_argument('--xlim', type=float, nargs=2,
                        metavar=('XMIN', 'XMAX'), default=None,
                        help='X-axis limits')
    parser.add_argument('--plotparams', type=str, default=None,
                        help='matplotlib parameters (YAML file)')
    parser.add_argument('--style', type=str, nargs='*',
                        choices=plt.style.available, default=None,
                        help='matplotlib style')
    parser.add_argument('--saveconfig', type=str, default=None,
                        help='Save configuration to file')
    main(parser.parse_args())
"""One-line description of what the script does."""
import argparse
def main(args):
    """Run the program."""
    # Echo the parsed file names so the template demonstrably works.
    for label, value in (('Input file:', args.infile),
                         ('Output file:', args.outfile)):
        print(label, value)
if __name__ == '__main__':
    # Two required positional arguments; extend as the script grows.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=str, help='Input file name')
    parser.add_argument('outfile', type=str, help='Output file name')
    main(parser.parse_args())
"""Collection of commonly used functions."""
import sys
import csv
# Error-message templates keyed by error name; callers fill in the
# placeholders with str.format (e.g. ERRORS['not_csv_suffix'].format(fname=f)).
ERRORS = {
    'not_csv_suffix': '{fname}: File must end in .csv',
}
def collection_to_csv(collection, num=None):
    """
    Write collection of items and counts in csv format.

    Parameters
    ----------
    collection : collections.Counter
        Collection of items and counts
    num : int
        Limit output to N most frequent items
    """
    # most_common() sorts by descending count; slicing past the end is safe.
    ranked = collection.most_common()
    limit = len(ranked) if num is None else num
    csv.writer(sys.stdout).writerows(ranked[:limit])
Full text of novels downloaded from Project Gutenberg.
- What is this?: text files (newline-terminated)
- Source(s): Project Gutenberg, https://www.gutenberg.org/
- License: public domain
- Each file contains the text of its license.
- See https://www.gutenberg.org/wiki/Gutenberg:Permission_How-To for details.
- Downloaded: 2018-08-15
- Contact: Greg Wilson <gvwilson@third-bit.com>
- Spatial Applicability: N/A
- Temporal Applicability: N/A
- Notes:
- Files contain some 8-bit characters (e.g., curly quotation marks).
- Files contain bibliographic information at the start.
- Files contain errata, license information, and other material at the end.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
results/collated.png

32.5 KiB

This diff is collapsed.
results/dracula.png

30.6 KiB

This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment