#! /usr/bin/env python
#
"""English Sentence Length distribution: PDF, CDF, inverse CDF, moments, sampling.

The distribution is discrete, defined on sentence lengths 1..79 (counted in
words), and is given by a table of relative frequencies.

Licensing:

  This code is distributed under the GNU LGPL license.

Modified:

  05 April 2016

Author:

  John Burkardt

Reference:

  Henry Kucera, Winthrop Francis,
  Computational Analysis of Present-Day American English,
  Brown University Press, 1967.
"""
import platform
import sys

import numpy as np

# Tabulated relative frequency of sentences of length 1, 2, ..., 79.
_PDF_VEC = np.array([
    0.00806, 0.01370, 0.01862, 0.02547, 0.03043,
    0.03189, 0.03516, 0.03545, 0.03286, 0.03533,
    0.03562, 0.03788, 0.03669, 0.03751, 0.03518,
    0.03541, 0.03434, 0.03305, 0.03329, 0.03103,
    0.02867, 0.02724, 0.02647, 0.02526, 0.02086,
    0.02178, 0.02128, 0.01801, 0.01690, 0.01556,
    0.01512, 0.01326, 0.01277, 0.01062, 0.01051,
    0.00901, 0.00838, 0.00764, 0.00683, 0.00589,
    0.00624, 0.00488, 0.00477, 0.00406, 0.00390,
    0.00350, 0.00318, 0.00241, 0.00224, 0.00220,
    0.00262, 0.00207, 0.00174, 0.00174, 0.00128,
    0.00121, 0.00103, 0.00117, 0.00124, 0.00082,
    0.00088, 0.00061, 0.00061, 0.00075, 0.00063,
    0.00056, 0.00052, 0.00057, 0.00031, 0.00029,
    0.00021, 0.00017, 0.00021, 0.00034, 0.00031,
    0.00011, 0.00011, 0.00008, 0.00006,
])

# Largest sentence length with nonzero probability.
_SENTENCE_LENGTH_MAX = int(_PDF_VEC.size)

# Running totals of the raw table; the last entry is the table's total mass.
_RAW_CUMULATIVE = np.cumsum(_PDF_VEC)

# Normalising constant, derived from the table itself.  The previously
# hard-coded value 0.99768 does not match the tabulated values (they total
# about 0.99866), which made the PDF sum to more than 1 and pushed the CDF
# above 1 for lengths 74..78.  Whether the table or the old constant was
# mistranscribed cannot be determined from this file; normalising by the
# actual total always yields a valid distribution.
_PDF_SUM = float(_RAW_CUMULATIVE[-1])

# Normalised CDF table: _CDF_VEC[k] is P(length <= k + 1).  The final entry
# is exactly 1.0 because it is the total divided by itself.  Both the CDF
# and its inverse read this one table, so they are exactly consistent.
_CDF_VEC = _RAW_CUMULATIVE / _PDF_SUM


def english_sentence_length_cdf(x):
    """Evaluate the English Sentence Length CDF.

    Parameters:

      Input, integer X, the sentence length whose CDF is desired.

      Output, real CDF, the probability that a sentence has length <= X.
      This is 0 for X < 1 and 1 for X >= 79.
    """
    if x < 1:
        return 0.0
    if _SENTENCE_LENGTH_MAX <= x:
        return 1.0
    return float(_CDF_VEC[x - 1])


def english_sentence_length_cdf_inv(cdf):
    """Invert the English Sentence Length CDF.

    Parameters:

      Input, real CDF, the value of the CDF.
      0.0 <= CDF <= 1.0.

      Output, integer X, the corresponding sentence length for which
      CDF(X-1) < CDF <= CDF(X).

    Raises:

      SystemExit, if CDF is outside [0, 1] (this includes NaN).  A message
      is printed first, as in the other "fatal error" paths of this library.
    """
    # Written as a positive range test so that NaN is rejected as well.
    if not (0.0 <= cdf <= 1.0):
        print('')
        print('ENGLISH_SENTENCE_LENGTH_CDF_INV - Fatal error!')
        print(' CDF < 0 or 1 < CDF.')
        sys.exit('ENGLISH_SENTENCE_LENGTH_CDF_INV - Fatal error!')

    # First table position whose cumulative probability is >= cdf.
    index = int(np.searchsorted(_CDF_VEC, cdf, side='left'))

    # The table ends at exactly 1.0, so index is at most 78 here; the clamp
    # is purely defensive.
    return min(index + 1, _SENTENCE_LENGTH_MAX)


def english_sentence_length_cdf_test():
    """Test ENGLISH_SENTENCE_LENGTH_CDF.

    Draws 10 samples and prints, for each, the sample, its PDF and CDF
    values, and the result of inverting the CDF value.
    """
    seed = 123456789

    print('')
    print('ENGLISH_SENTENCE_LENGTH_CDF_TEST')
    print(' Python version: %s' % (platform.python_version()))
    print(' ENGLISH_SENTENCE_LENGTH_CDF evaluates the English Sentence Length CDF')
    print(' ENGLISH_SENTENCE_LENGTH_CDF_INV inverts the English Sentence Length CDF.')
    print(' ENGLISH_SENTENCE_LENGTH_PDF evaluates the English Sentence Length PDF')
    print('')
    print(' X PDF CDF CDF_INV')
    print('')

    for _ in range(10):
        x, seed = english_sentence_length_sample(seed)
        pdf = english_sentence_length_pdf(x)
        cdf = english_sentence_length_cdf(x)
        x2 = english_sentence_length_cdf_inv(cdf)
        print(' %12d %12g %12g %12d' % (x, pdf, cdf, x2))
#
#  Terminate.
#
    print('')
    print('ENGLISH_SENTENCE_LENGTH_CDF_TEST')
    print(' Normal end of execution.')
    return


def english_sentence_length_mean():
    """Evaluate the mean of the English Sentence Length PDF.

    Parameters:

      Output, real MEAN, the mean of the PDF.
    """
    lengths = np.arange(1, _SENTENCE_LENGTH_MAX + 1, dtype=float)
    return float(np.dot(lengths, _PDF_VEC) / _PDF_SUM)


def english_sentence_length_pdf(x):
    """Evaluate the English Sentence Length PDF.

    Discussion:

      PDF(X) = B(X) if 1 <= X <= 79
             = 0    otherwise

      where B is the tabulated frequency, normalised to sum to 1.

    Parameters:

      Input, integer X, the sentence length whose probability is desired.

      Output, real PDF, the value of the PDF.
    """
    if 1 <= x <= _SENTENCE_LENGTH_MAX:
        return float(_PDF_VEC[x - 1] / _PDF_SUM)
    return 0.0


def english_sentence_length_sample(seed):
    """Sample the English Sentence Length PDF.

    Parameters:

      Input/output, integer SEED, a seed for the random number generator.

      Output, integer X, a sample of the PDF.

    Returns the pair (X, SEED), where SEED is the updated seed.
    """
    # Project-local helper, imported here so this module can be imported
    # without it.  Presumably it returns a uniform value in [0,1] plus the
    # updated seed -- confirm against r8_uniform_01.
    from r8_uniform_01 import r8_uniform_01

    cdf, seed = r8_uniform_01(seed)
    x = english_sentence_length_cdf_inv(cdf)
    return x, seed


def english_sentence_length_sample_test():
    """Test ENGLISH_SENTENCE_LENGTH_SAMPLE.

    Prints the mean and variance of the PDF, then the mean, variance,
    maximum and minimum of 1000 samples.
    """
    # Project-local statistics helpers, imported lazily so that importing
    # this module does not require them.
    from i4vec_max import i4vec_max
    from i4vec_mean import i4vec_mean
    from i4vec_min import i4vec_min
    from i4vec_variance import i4vec_variance

    sample_num = 1000
    seed = 123456789

    print('')
    print('ENGLISH_SENTENCE_LENGTH_SAMPLE_TEST')
    print(' Python version: %s' % (platform.python_version()))
    print(' ENGLISH_SENTENCE_LENGTH_MEAN computes the English Sentence Length mean')
    print(' ENGLISH_SENTENCE_LENGTH_SAMPLE samples the English Sentence Length distribution')
    print(' ENGLISH_SENTENCE_LENGTH_VARIANCE computes the English Sentence Length variance.')

    mean = english_sentence_length_mean()
    variance = english_sentence_length_variance()

    print('')
    print(' PDF mean = %14g' % (mean))
    print(' PDF variance = %14g' % (variance))

    x = np.zeros(sample_num)
    for i in range(sample_num):
        x[i], seed = english_sentence_length_sample(seed)

    mean = i4vec_mean(sample_num, x)
    variance = i4vec_variance(sample_num, x)
    xmax = i4vec_max(sample_num, x)
    xmin = i4vec_min(sample_num, x)

    print('')
    print(' Sample size = %12d' % (sample_num))
    print(' Sample mean = %14g' % (mean))
    print(' Sample variance = %14g' % (variance))
    print(' Sample maximum = %14g' % (xmax))
    print(' Sample minimum = %14g' % (xmin))
#
#  Terminate.
#
    print('')
    print('ENGLISH_SENTENCE_LENGTH_SAMPLE_TEST')
    print(' Normal end of execution.')
    return


def english_sentence_length_variance():
    """Evaluate the variance of the English Sentence Length PDF.

    Parameters:

      Output, real VARIANCE, the variance of the PDF.
    """
    lengths = np.arange(1, _SENTENCE_LENGTH_MAX + 1, dtype=float)
    mean = np.dot(lengths, _PDF_VEC) / _PDF_SUM
    return float(np.dot(_PDF_VEC, (lengths - mean) ** 2) / _PDF_SUM)


if (__name__ == '__main__'):
    from timestamp import timestamp
    timestamp()
    english_sentence_length_cdf_test()
    english_sentence_length_sample_test()
    timestamp()