#! /usr/bin/env python
#
def english_word_length_cdf ( x ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_CDF evaluates the English Word Length CDF.
#
#  Discussion:
#
#    The CDF at X is the sum of the first X tabulated word length
#    frequencies, divided by the total of the table (0.99997), so that
#    the values are those of a properly normalized distribution.
#
#    The partial sum is accumulated one term at a time, starting from
#    0.0, which is exactly the arithmetic ENGLISH_WORD_LENGTH_CDF_INV
#    performs.  Keeping the two routines in step means that inverting
#    the value returned here recovers X, with no dependence on the
#    summation order of an external helper.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
#  Reference:
#
#    Henry Kucera, Winthrop Francis,
#    Computational Analysis of Present-Day American English,
#    Brown University Press, 1967.
#
#  Parameters:
#
#    Input, integer X, the word length whose CDF is desired.
#    A non-integer X with 1 <= X < 27 is truncated to an integer,
#    which gives the probability that a word length is at most X.
#
#    Output, real CDF, the value of the CDF.
#    CDF is 0 for X < 1 and 1 for 27 <= X.
#
  import numpy as np

  word_length_max = 27

  pdf_vec = np.array ( [ \
    0.03160, 0.16975, 0.21192, 0.15678, 0.10852, 0.08524, 0.07724, 0.05623, 0.04032, \
    0.02766, 0.01582, 0.00917, 0.00483, 0.00262, 0.00099, 0.00050, 0.00027, 0.00022, \
    0.00011, 0.00006, 0.00005, 0.00002, 0.00001, 0.00001, 0.00001, 0.00001, 0.00001 ] )

  pdf_sum = 0.99997
#
#  Below the shortest word length nothing has been accumulated yet.
#
  if ( x < 1 ):
    return 0.0
#
#  At or beyond the longest tabulated length the CDF is exactly 1.
#
  if ( word_length_max <= x ):
    return 1.0
#
#  Add up the first X table entries in order.
#
  cum = 0.0
  for j in range ( 0, int ( x ) ):
    cum = cum + pdf_vec[j]

  return cum / pdf_sum

def english_word_length_cdf_inv ( cdf ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_CDF_INV inverts the English Word Length CDF.
#
#  Discussion:
#
#    The routine walks through the table of word length frequencies,
#    keeping a running total, and stops at the first word length whose
#    normalized running total is at least CDF.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
#  Reference:
#
#    Henry Kucera, Winthrop Francis,
#    Computational Analysis of Present-Day American English,
#    Brown University Press, 1967.
#
#  Parameters:
#
#    Input, real CDF, the value of the CDF.
#    0.0 <= CDF <= 1.0.  A value outside this range prints an error
#    message and terminates through sys.exit.
#
#    Output, integer X, the corresponding word length for which
#    CDF(X-1) < CDF <= CDF(X)
#
  import numpy as np
  from sys import exit

  longest = 27

  weights = np.array ( [ \
    0.03160, 0.16975, 0.21192, 0.15678, 0.10852, 0.08524, 0.07724, 0.05623, 0.04032, \
    0.02766, 0.01582, 0.00917, 0.00483, 0.00262, 0.00099, 0.00050, 0.00027, 0.00022, \
    0.00011, 0.00006, 0.00005, 0.00002, 0.00001, 0.00001, 0.00001, 0.00001, 0.00001 ] )

  total = 0.99997
#
#  A probability outside [0,1] cannot be inverted.
#
  if ( cdf < 0.0 or cdf > 1.0 ):
    print ( '' )
    print ( 'ENGLISH_WORD_LENGTH_CDF_INV - Fatal error!' )
    print ( '  CDF < 0 or 1 < CDF.' )
    exit ( 'ENGLISH_WORD_LENGTH_CDF_INV - Fatal error!' )
#
#  Accumulate the table until the normalized total reaches CDF.
#
  running = 0.0

  for length, weight in zip ( range ( 1, longest + 1 ), weights ):

    running = running + weight

    if ( running / total >= cdf ):
      return length
#
#  Roundoff may leave the final normalized total a hair below 1;
#  in that case the longest word length is the answer.
#
  return longest

def english_word_length_cdf_test ( ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_CDF_TEST tests ENGLISH_WORD_LENGTH_CDF.
#
#  Discussion:
#
#    Ten word lengths are sampled.  For each one the PDF and CDF are
#    evaluated, the CDF value is inverted, and the four values are
#    printed on one line of a table.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
  import platform

  trial_num = 10
  seed = 123456789
#
#  Banner.
#
  print ( '' )
  print ( 'ENGLISH_WORD_LENGTH_CDF_TEST' )
  print ( '  Python version: %s' % ( platform.python_version ( ) ) )
  print ( '  ENGLISH_WORD_LENGTH_CDF evaluates the English Word Length CDF' )
  print ( '  ENGLISH_WORD_LENGTH_CDF_INV inverts the English Word Length CDF.' )
  print ( '  ENGLISH_WORD_LENGTH_PDF evaluates the English Word Length PDF' )
#
#  Table heading.
#
  print ( '' )
  print ( '       X            PDF           CDF            CDF_INV' )
  print ( '' )
#
#  One table row per sampled word length.
#
  for _ in range ( trial_num ):
    length, seed = english_word_length_sample ( seed )
    density = english_word_length_pdf ( length )
    cumulative = english_word_length_cdf ( length )
    recovered = english_word_length_cdf_inv ( cumulative )
    print ( '  %12d  %12g  %12g  %12d' % ( length, density, cumulative, recovered ) )
#
#  Terminate.
#
  print ( '' )
  print ( 'ENGLISH_WORD_LENGTH_CDF_TEST' )
  print ( '  Normal end of execution.' )

def english_word_length_mean ( ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_MEAN evaluates the mean of the English Word Length PDF.
#
#  Discussion:
#
#    The mean is the sum of each word length times its tabulated
#    frequency, divided by the total of the table (0.99997).
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
#  Reference:
#
#    Henry Kucera, Winthrop Francis,
#    Computational Analysis of Present-Day American English,
#    Brown University Press, 1967.
#
#  Parameters:
#
#    Output, real MEAN, the mean of the PDF.
#
  import numpy as np

  longest = 27

  weights = np.array ( [ \
    0.03160, 0.16975, 0.21192, 0.15678, 0.10852, 0.08524, 0.07724, 0.05623, 0.04032, \
    0.02766, 0.01582, 0.00917, 0.00483, 0.00262, 0.00099, 0.00050, 0.00027, 0.00022, \
    0.00011, 0.00006, 0.00005, 0.00002, 0.00001, 0.00001, 0.00001, 0.00001, 0.00001 ] )

  total = 0.99997
#
#  Weighted sum of the word lengths 1 through 27.
#
  weighted = 0.0
  for length, weight in zip ( range ( 1, longest + 1 ), weights ):
    weighted = weighted + length * weight
#
#  Normalize by the table total.
#
  return weighted / total

def english_word_length_pdf ( x ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_PDF evaluates the English Word Length PDF.
#
#  Discussion:
#
#    PDF(X) = PDF_VEC(X) / PDF_SUM  if 1 <= X <= 27
#           = 0                     otherwise
#
#    where PDF_VEC is the table of word length frequencies and
#    PDF_SUM = 0.99997 is the total of that table.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
#  Reference:
#
#    Henry Kucera, Winthrop Francis,
#    Computational Analysis of Present-Day American English,
#    Brown University Press, 1967.
#
#  Parameters:
#
#    Input, integer X, the word length whose probability is desired.
#
#    Output, real PDF, the value of the PDF.
#
  import numpy as np

  longest = 27

  weights = np.array ( [ \
    0.03160, 0.16975, 0.21192, 0.15678, 0.10852, 0.08524, 0.07724, 0.05623, 0.04032, \
    0.02766, 0.01582, 0.00917, 0.00483, 0.00262, 0.00099, 0.00050, 0.00027, 0.00022, \
    0.00011, 0.00006, 0.00005, 0.00002, 0.00001, 0.00001, 0.00001, 0.00001, 0.00001 ] )

  total = 0.99997
#
#  Only the tabulated lengths have nonzero probability.
#  The table is 0-based, so length X lives at index X-1.
#
  tabulated = ( 1 <= x <= longest )

  return weights[x-1] / total if tabulated else 0.0

def english_word_length_sample ( seed ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_SAMPLE samples the English Word Length PDF.
#
#  Discussion:
#
#    A value is obtained from R8_UNIFORM_01 and then passed through
#    ENGLISH_WORD_LENGTH_CDF_INV to turn it into a word length.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
#  Reference:
#
#    Henry Kucera, Winthrop Francis,
#    Computational Analysis of Present-Day American English,
#    Brown University Press, 1967.
#
#  Parameters:
#
#    Input/output, integer SEED, a seed for the random number generator.
#    The updated seed returned by R8_UNIFORM_01 is passed back to the caller.
#
#    Output, integer X, a sample of the PDF.
#
  from r8_uniform_01 import r8_uniform_01

  u, seed = r8_uniform_01 ( seed )

  return english_word_length_cdf_inv ( u ), seed

def english_word_length_sample_test ( ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_SAMPLE_TEST tests ENGLISH_WORD_LENGTH_SAMPLE.
#
#  Discussion:
#
#    The mean and variance of the PDF are printed first.  Then 1000
#    word lengths are sampled, and the mean, variance, maximum and
#    minimum reported by the I4VEC utilities for that sample are printed.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
  import numpy as np
  import platform
  from i4vec_max import i4vec_max
  from i4vec_mean import i4vec_mean
  from i4vec_min import i4vec_min
  from i4vec_variance import i4vec_variance

  sample_num = 1000

  seed = 123456789
#
#  Banner.
#
  print ( '' )
  print ( 'ENGLISH_WORD_LENGTH_SAMPLE_TEST' )
  print ( '  Python version: %s' % ( platform.python_version ( ) ) )
  print ( '  ENGLISH_WORD_LENGTH_MEAN computes the English Word Length mean' )
  print ( '  ENGLISH_WORD_LENGTH_SAMPLE samples the English Word Length distribution' )
  print ( '  ENGLISH_WORD_LENGTH_VARIANCE computes the English Word Length variance.' )
#
#  Values computed from the PDF itself.
#
  pdf_mean = english_word_length_mean ( )
  pdf_variance = english_word_length_variance ( )

  print ( '' )
  print ( '  PDF mean =                    %14g' % ( pdf_mean ) )
  print ( '  PDF variance =                %14g' % ( pdf_variance ) )
#
#  Draw the sample.  The values are stored in a real array.
#
  samples = np.zeros ( sample_num )
  for k in range ( sample_num ):
    value, seed = english_word_length_sample ( seed )
    samples[k] = value
#
#  Statistics of the sample.
#
  sample_mean = i4vec_mean ( sample_num, samples )
  sample_variance = i4vec_variance ( sample_num, samples )
  sample_max = i4vec_max ( sample_num, samples )
  sample_min = i4vec_min ( sample_num, samples )

  print ( '' )
  print ( '  Sample size =     %12d' % ( sample_num ) )
  print ( '  Sample mean =     %14g' % ( sample_mean ) )
  print ( '  Sample variance = %14g' % ( sample_variance ) )
  print ( '  Sample maximum =  %14d' % ( sample_max ) )
  print ( '  Sample minimum =  %14d' % ( sample_min ) )
#
#  Terminate.
#
  print ( '' )
  print ( 'ENGLISH_WORD_LENGTH_SAMPLE_TEST' )
  print ( '  Normal end of execution.' )

def english_word_length_variance ( ):

#*****************************************************************************80
#
## ENGLISH_WORD_LENGTH_VARIANCE: variance of the English Word Length PDF.
#
#  Discussion:
#
#    The mean is computed first, as the frequency-weighted sum of the
#    word lengths divided by the table total (0.99997).  The variance
#    is then the frequency-weighted sum of the squared deviations from
#    that mean, again divided by the table total.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    08 April 2016
#
#  Author:
#
#    John Burkardt
#
#  Reference:
#
#    Henry Kucera, Winthrop Francis,
#    Computational Analysis of Present-Day American English,
#    Brown University Press, 1967.
#
#  Parameters:
#
#    Output, real VARIANCE, the variance of the PDF.
#
  import numpy as np

  longest = 27

  weights = np.array ( [ \
    0.03160, 0.16975, 0.21192, 0.15678, 0.10852, 0.08524, 0.07724, 0.05623, 0.04032, \
    0.02766, 0.01582, 0.00917, 0.00483, 0.00262, 0.00099, 0.00050, 0.00027, 0.00022, \
    0.00011, 0.00006, 0.00005, 0.00002, 0.00001, 0.00001, 0.00001, 0.00001, 0.00001 ] )

  total = 0.99997
#
#  First pass: the mean word length.
#
  center = 0.0
  for length, weight in zip ( range ( 1, longest + 1 ), weights ):
    center = center + length * weight

  center = center / total
#
#  Second pass: weighted squared deviations from the mean.
#
  spread = 0.0
  for length, weight in zip ( range ( 1, longest + 1 ), weights ):
    deviation = length - center
    spread = spread + weight * deviation ** 2

  return spread / total

if __name__ == '__main__':
#
#  When run as a script: call TIMESTAMP, run the two test drivers,
#  then call TIMESTAMP again.
#
  from timestamp import timestamp

  timestamp ( )

  english_word_length_cdf_test ( )
  english_word_length_sample_test ( )

  timestamp ( )