Skip to content
Snippets Groups Projects
Commit 3c5b5eb1 authored by Boehm, Valentin's avatar Boehm, Valentin
Browse files

Add scripts, novels, word counts, and plots

parents
No related branches found
No related tags found
No related merge requests found
Showing with 310 additions and 0 deletions
.DS_Store
\ No newline at end of file
# Get desired information from a Project Gutenberg eBook.
# Usage: bash book_summary.sh /path/to/file.txt what_to_look_for
# Quote both expansions: an unquoted $1 breaks on paths containing
# spaces, and an unquoted $2 would be glob-expanded by the shell.
head -n 17 "$1" | tail -n 8 | grep "$2"
"""
Combine multiple word count CSV-files
into a single cumulative count.
"""
import csv
import argparse
from collections import Counter
import logging
import utilities as util
def update_counts(reader, word_counts):
    """Fold the (word, count) rows of an open CSV stream into word_counts."""
    for row in csv.reader(reader):
        token, tally = row
        word_counts[token] += int(tally)
def process_file(fname, word_counts):
    """Read one word-count CSV file and fold its totals into word_counts.

    Parameters
    ----------
    fname : str
        Path to a ``.csv`` file of ``word,count`` rows.
    word_counts : collections.Counter
        Running totals, updated in place.

    Raises
    ------
    OSError
        If ``fname`` does not end in ``.csv``.
    """
    logging.debug(f'Reading in {fname}...')
    # Reject non-CSV inputs up front with a clear, formatted message.
    # str.endswith is the idiomatic (and slice-free) suffix test.
    if not fname.endswith('.csv'):
        msg = util.ERRORS['not_csv_suffix'].format(
            fname=fname)
        raise OSError(msg)
    with open(fname, 'r') as reader:
        logging.debug('Computing word counts...')
        update_counts(reader, word_counts)
def main(args):
    """Run the command line program."""
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
        filename=args.logfile)
    word_counts = Counter()
    logging.info('Processing files...')
    # Best-effort: a bad input file is logged and skipped, never fatal.
    for fname in args.infiles:
        try:
            process_file(fname, word_counts)
        except FileNotFoundError:
            logging.warning(f'{fname} not processed: File does not exist')
        except PermissionError:
            logging.warning(f'{fname} not processed: No read permission')
        except Exception as error:
            logging.warning(f'{fname} not processed: {error}')
    util.collection_to_csv(word_counts, num=args.num)
if __name__ == '__main__':
    # Command-line interface: input files plus output/logging options.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infiles', type=str, nargs='*',
                        help='Input file names')
    parser.add_argument('-n', '--num', type=int, default=None,
                        help='Output n most frequent words')
    parser.add_argument('-v', '--verbose', action="store_true",
                        default=False,
                        help="Set logging level to DEBUG")
    parser.add_argument('-l', '--logfile', type=str,
                        default='collate.log',
                        help='Name of the log file')
    main(parser.parse_args())
"""
Count the occurrences of all words in a text
and output them in CSV format.
"""
import argparse
import string
from collections import Counter
import utilities as util
def count_words(reader):
    """Count the occurrence of each word in a string."""
    # Split on whitespace, strip surrounding punctuation, drop empties,
    # and fold case before tallying.
    stripped = (chunk.strip(string.punctuation)
                for chunk in reader.read().split())
    return Counter(token.lower() for token in stripped if token)
def main(args):
    """Run the command line program."""
    # Count the words in the input stream and write CSV to stdout.
    util.collection_to_csv(count_words(args.infile), num=args.num)
if __name__ == '__main__':
    # Command-line interface: optional input file (default stdin) and -n.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=argparse.FileType('r'),
                        nargs='?', default='-',
                        help='Input file name')
    parser.add_argument('-n', '--num', type=int, default=None,
                        help='Output n most frequent words')
    main(parser.parse_args())
"""Plot word counts."""
import argparse
import yaml
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.optimize import minimize_scalar
def nlog_likelihood(beta, counts):
    """Negative log-likelihood of a discrete power law with exponent beta."""
    inv = 1.0 / counts
    inv_next = 1.0 / (counts + 1)
    terms = np.log(inv ** (beta - 1) - inv_next ** (beta - 1))
    return -np.sum(terms)
def get_power_law_params(word_counts):
    """
    Get the power law parameters.

    Parameters
    ----------
    word_counts : numpy.ndarray
        Word frequency counts.

    Returns
    -------
    float
        Estimated alpha parameter of the power law.

    Raises
    ------
    TypeError
        If ``word_counts`` is not a numpy array.

    References
    ----------
    Moreno-Sanchez et al (2016) define alpha (Eq. 1),
    beta (Eq. 2) and the maximum likelihood estimation (mle)
    of beta (Eq. 6).
    Moreno-Sanchez I, Font-Clos F, Corral A (2016)
    Large-Scale Analysis of Zipf's Law in English Texts.
    PLoS ONE 11(1): e0147073.
    https://doi.org/10.1371/journal.pone.0147073
    """
    # isinstance (not type ==) also accepts ndarray subclasses, and an
    # explicit raise survives ``python -O`` where asserts are stripped.
    if not isinstance(word_counts, np.ndarray):
        raise TypeError(
            'Input must be a numerical (numpy) array of word counts')
    mle = minimize_scalar(nlog_likelihood,
                          bracket=(1 + 1e-10, 4),
                          # args must be a tuple: scipy calls
                          # func(x, *args), so a bare array would be
                          # unpacked into one argument per element.
                          args=(word_counts,),
                          method='brent')
    beta = mle.x
    alpha = 1 / (beta - 1)
    return alpha
def set_plot_params(param_file):
    """Apply matplotlib rcParams read from a YAML file (no-op if none given)."""
    param_dict = {}
    if param_file:
        with open(param_file, 'r') as reader:
            param_dict = yaml.load(reader,
                                   Loader=yaml.BaseLoader)
    for param, value in param_dict.items():
        mpl.rcParams[param] = value
def plot_fit(curve_xmin, curve_xmax, max_rank, alpha, ax):
    """
    Overlay the fitted power-law curve on an existing scatter plot.

    Parameters
    ----------
    curve_xmin : float
        Minimum x-bound for fitted curve
    curve_xmax : float
        Maximum x-bound for fitted curve
    max_rank : int
        Maximum word frequency rank.
    alpha : float
        Estimated alpha parameter for the power law.
    ax : matplotlib axes
        Scatter plot to which the power curve will be added.
    """
    frequencies = np.arange(curve_xmin, curve_xmax)
    ranks = max_rank * frequencies ** (-1 / alpha)
    ax.loglog(frequencies, ranks, color='grey')
def save_configuration(fname, params):
    """Dump the given parameters to a YAML file."""
    # Open for writing; 'ostream' (not 'reader') reflects the direction.
    with open(fname, 'w') as ostream:
        yaml.dump(params, ostream)
def main(args):
    """Run the command line program."""
    if args.style:
        plt.style.use(args.style)
    set_plot_params(args.plotparams)
    if args.saveconfig:
        save_configuration(args.saveconfig, mpl.rcParams)
    counts = pd.read_csv(args.infile, header=None,
                         names=('word', 'word_frequency'))
    counts['rank'] = counts['word_frequency'].rank(ascending=False,
                                                   method='max')
    ax = counts.plot.scatter(x='word_frequency', y='rank',
                             loglog=True, figsize=[12, 6],
                             grid=True, xlim=args.xlim)
    frequencies = counts['word_frequency'].to_numpy()
    alpha = get_power_law_params(frequencies)
    print('alpha:', alpha)
    # Ranks are already sorted, so the last entry is the largest rank.
    max_rank = counts['rank'].to_numpy()[-1]
    # Bound the fitted curve by the observed frequency range.
    plot_fit(frequencies.min(), frequencies.max(), max_rank, alpha, ax)
    ax.figure.savefig(args.outfile)
if __name__ == '__main__':
    # Command-line interface: input CSV plus plotting/configuration options.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=argparse.FileType('r'),
                        nargs='?', default='-',
                        help='Word count csv file name')
    parser.add_argument('--outfile', type=str,
                        default='plotcounts.png',
                        help='Output image file name')
    parser.add_argument('--xlim', type=float, nargs=2,
                        metavar=('XMIN', 'XMAX'), default=None,
                        help='X-axis limits')
    parser.add_argument('--plotparams', type=str, default=None,
                        help='matplotlib parameters (YAML file)')
    parser.add_argument('--style', type=str, nargs='*',
                        choices=plt.style.available, default=None,
                        help='matplotlib style')
    parser.add_argument('--saveconfig', type=str, default=None,
                        help='Save configuration to file')
    main(parser.parse_args())
"""One-line description of what the script does."""
import argparse
def main(args):
    """Run the program."""
    # Echo the parsed file names so the template demonstrably works.
    for label, value in (('Input file:', args.infile),
                         ('Output file:', args.outfile)):
        print(label, value)
if __name__ == '__main__':
    # Two required positional arguments; extend as the script grows.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=str, help='Input file name')
    parser.add_argument('outfile', type=str, help='Output file name')
    main(parser.parse_args())
"""Collection of commonly used functions."""
import sys
import csv
# Error-message templates keyed by error name; callers fill in the
# placeholders with str.format (e.g. ERRORS['not_csv_suffix'].format(fname=f)).
ERRORS = {
    'not_csv_suffix': '{fname}: File must end in .csv',
}
def collection_to_csv(collection, num=None):
    """
    Write collection of items and counts in csv format.

    Parameters
    ----------
    collection : collections.Counter
        Collection of items and counts
    num : int
        Limit output to N most frequent items
    """
    # most_common() sorts by descending count; slicing past the end is safe.
    ranked = collection.most_common()
    limit = len(ranked) if num is None else num
    csv.writer(sys.stdout).writerows(ranked[:limit])
Full text of novels downloaded from Project Gutenberg.
- What is this?: text files (newline-terminated)
- Source(s): Project Gutenberg, https://www.gutenberg.org/
- License: public domain
- Each file contains the text of its license.
- See https://www.gutenberg.org/wiki/Gutenberg:Permission_How-To for details.
- Downloaded: 2018-08-15
- Contact: Greg Wilson <gvwilson@third-bit.com>
- Spatial Applicability: N/A
- Temporal Applicability: N/A
- Notes:
- Files contain some 8-bit characters (e.g., curly quotation marks).
- Files contain bibliographic information at the start.
- Files contain errata, license information, and other material at the end.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
results/collated.png

32.5 KiB

This diff is collapsed.
results/dracula.png

30.6 KiB

This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment