Skip to content
Snippets Groups Projects
Select Git revision
  • 30d38827b338642d3519ffd8991643c259f0078c
  • main default protected
2 results

plotcounts.py

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    plotcounts.py 4.52 KiB
    """Plot word counts."""
    
    import argparse
    
    import yaml
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from scipy.optimize import minimize_scalar
    
    
    def nlog_likelihood(beta, counts):
        """
        Evaluate the negative log-likelihood of beta for the given counts.

        Parameters
        ----------
        beta : float
            Candidate value of the beta exponent.
        counts : numpy array
            Word counts the likelihood is evaluated against.

        Returns
        -------
        The negated sum, over all counts ``n``, of
        ``log((1/n)**(beta - 1) - (1/(n + 1))**(beta - 1))``.
        """
        exponent = beta - 1
        lower_term = (1 / counts)**exponent
        upper_term = (1 / (counts + 1))**exponent
        return -np.sum(np.log(lower_term - upper_term))
    
    
    def get_power_law_params(word_counts):
        """
        Estimate the power law alpha parameter from word counts.

        Beta is found by minimising ``nlog_likelihood`` over the word
        counts with Brent's method, then converted to alpha via
        ``alpha = 1 / (beta - 1)``.

        Parameters
        ----------
        word_counts : numpy.ndarray
            Word counts to fit.

        Returns
        -------
        alpha : float
            Estimated alpha parameter (beta itself is not returned).

        Raises
        ------
        AssertionError
            If ``word_counts`` is not a ``numpy.ndarray``.

        References
        ----------
        Moreno-Sanchez et al (2016) define alpha (Eq. 1),
          beta (Eq. 2) and the maximum likelihood estimation (mle)
          of beta (Eq. 6).

        Moreno-Sanchez I, Font-Clos F, Corral A (2016)
          Large-Scale Analysis of Zipf's Law in English Texts.
          PLoS ONE 11(1): e0147073.
          https://doi.org/10.1371/journal.pone.0147073
        """
        # Raise explicitly instead of using an ``assert`` statement so
        # the check is not stripped when Python runs with -O.  The
        # exception type and message are kept so existing callers see
        # the same error, and the exact-type check is kept so the set
        # of accepted inputs is unchanged.
        if type(word_counts) is not np.ndarray:
            raise AssertionError(
                'Input must be a numerical (numpy) array of word counts')
        mle = minimize_scalar(nlog_likelihood,
                              bracket=(1 + 1e-10, 4),
                              args=(word_counts,),
                              method='brent')
        beta = mle.x
        alpha = 1 / (beta - 1)
        return alpha
    
    
    def set_plot_params(param_file):
        """
        Set the matplotlib parameters from a YAML file.

        Parameters
        ----------
        param_file : str or None
            Path to a YAML file whose top-level keys are matplotlib
            rcParams names. If falsy, no parameters are changed.
        """
        if not param_file:
            return
        with open(param_file, 'r') as reader:
            # BaseLoader builds only plain strings, lists and dicts.
            # An empty document loads as None, so fall back to an
            # empty mapping instead of failing on ``.items()`` below.
            param_dict = yaml.load(reader,
                                   Loader=yaml.BaseLoader) or {}
        for param, value in param_dict.items():
            mpl.rcParams[param] = value
    
    
    def plot_fit(curve_xmin, curve_xmax, max_rank, alpha, ax):
        """
        Draw the fitted power law curve on an existing log-log plot.

        The curve is ``max_rank * x**(-1 / alpha)`` evaluated at unit
        steps starting from ``curve_xmin``.

        Parameters
        ----------
        curve_xmin : float
            Minimum x-bound for fitted curve (included).
        curve_xmax : float
            Maximum x-bound for fitted curve (excluded, because the
            x values come from ``np.arange``).
        max_rank : int
            Maximum word frequency rank.
        alpha : float
            Estimated alpha parameter for the power law.
        ax : matplotlib axes
            Scatter plot to which the power curve will be added.
        """
        exponent = -1 / alpha
        curve_x = np.arange(curve_xmin, curve_xmax)
        curve_y = max_rank * (curve_x**exponent)
        ax.loglog(curve_x, curve_y, color='grey')
    
    
    def save_configuration(fname, params):
        """
        Write the given parameters to a file as YAML.

        Parameters
        ----------
        fname : str
            Path of the file to write; it is opened in 'w' mode.
        params : object
            Data handed to ``yaml.dump`` for serialisation.
        """
        with open(fname, 'w') as writer:
            yaml.dump(params, stream=writer)
    
    
    def main(args):
        """
        Run the command line program.

        Reads word counts from a CSV, plots word frequency against
        rank on log-log axes, overlays the fitted power law curve and
        saves the figure.

        Parameters
        ----------
        args : argparse.Namespace
            Parsed command line options. The attributes read here are
            ``style``, ``plotparams``, ``saveconfig``, ``infile``,
            ``xlim`` and ``outfile``.
        """
        if args.style:
            plt.style.use(args.style)
        set_plot_params(args.plotparams)
        if args.saveconfig:
            save_configuration(args.saveconfig, mpl.rcParams)
        df = pd.read_csv(args.infile, header=None,
                         names=('word', 'word_frequency'))
        df['rank'] = df['word_frequency'].rank(ascending=False,
                                               method='max')
        ax = df.plot.scatter(x='word_frequency',
                             y='rank', loglog=True,
                             figsize=[12, 6],
                             grid=True,
                             xlim=args.xlim)

        word_counts = df['word_frequency'].to_numpy()
        alpha = get_power_law_params(word_counts)
        print('alpha:', alpha)

        # Take the largest rank directly instead of reading the last
        # row: the last row only holds the highest rank when the input
        # is sorted by descending count, which is not checked here.
        max_rank = df['rank'].max()

        # Use the range of the data as the boundaries
        # when drawing the power law curve
        curve_xmin = df['word_frequency'].min()
        curve_xmax = df['word_frequency'].max()

        plot_fit(curve_xmin, curve_xmax, max_rank, alpha, ax)
        ax.figure.savefig(args.outfile)
    
    
    if __name__ == '__main__':
        # Build the command line interface; the module docstring is
        # reused as the program description.
        cli = argparse.ArgumentParser(description=__doc__)

        # Positional input: optional, falls back to '-' when omitted.
        cli.add_argument(
            'infile',
            type=argparse.FileType('r'),
            nargs='?',
            default='-',
            help='Word count csv file name',
        )
        cli.add_argument(
            '--outfile',
            type=str,
            default='plotcounts.png',
            help='Output image file name',
        )
        cli.add_argument(
            '--xlim',
            type=float,
            nargs=2,
            metavar=('XMIN', 'XMAX'),
            default=None,
            help='X-axis limits',
        )
        cli.add_argument(
            '--plotparams',
            type=str,
            default=None,
            help='matplotlib parameters (YAML file)',
        )
        # Zero or more style names, each restricted to the styles
        # matplotlib reports as available.
        cli.add_argument(
            '--style',
            type=str,
            nargs='*',
            choices=plt.style.available,
            default=None,
            help='matplotlib style',
        )
        cli.add_argument(
            '--saveconfig',
            type=str,
            default=None,
            help='Save configuration to file',
        )

        cli_args = cli.parse_args()
        main(cli_args)