#! /usr/bin/env python
#
'''
Allows scoring of text using n-gram probabilities
17/07/12
'''


class ngram_score(object):

    #*****************************************************************************80
    #
    ## NGRAM_SCORE is a class for the Ngram scoring program.
    #
    #  Discussion:
    #
    #    An instance holds a table of n-gram log10 probabilities read from
    #    a file, and scores a text by summing the table entries of every
    #    window of L consecutive characters in that text.
    #
    #    Attributes set by the constructor:
    #
    #      ngrams, dict mapping each n-gram string to log10 ( count / N ).
    #      L, the length of the key on the last data line of the file,
    #        used as the window width when scoring.
    #      N, the sum of all counts in the file.
    #      floor, log10 ( 0.01 / N ), the value used for any window that
    #        is not in the table.
    #
    #  Author:
    #
    #    Unknown
    #
    def __init__(self, ngramfile, sep=' '):
        ''' load a file containing ngrams and counts, calculate log probabilities

        ngramfile: path of a text file with one "<ngram><sep><count>" entry
          per line.
        sep: the string separating the ngram from its count on each line.

        Raises ValueError if the file contains no entries, or if a nonblank
        line does not split into exactly two fields with an integer count.
        '''
        import numpy as np

        self.ngrams = {}
        key = None
        #
        #  The "with" block guarantees the file is closed, even if a line
        #  turns out to be malformed.
        #
        with open(ngramfile, 'rt') as fh:
            for line in fh:
                #
                #  Skip blank lines (e.g. a trailing empty line) instead of
                #  failing on the two-field unpack.  The line itself is not
                #  stripped, so keys are stored exactly as written.
                #
                if not line.strip():
                    continue
                key, count = line.split(sep)
                self.ngrams[key] = int(count)

        if key is None:
            raise ValueError("no n-grams found in '%s'" % ngramfile)

        self.L = len(key)
        self.N = sum(self.ngrams.values())
        #
        #  Calculate log probabilities.
        #
        for key in self.ngrams:
            self.ngrams[key] = np.log10(float(self.ngrams[key]) / self.N)
        self.floor = np.log10(0.01 / self.N)

    def score(self, text):
        ''' compute the score of text

        Returns the sum, over every window of self.L consecutive characters
        of text, of that window's log10 probability; windows missing from
        the table contribute self.floor.  A text shorter than self.L has no
        windows and scores 0.
        '''
        lookup = self.ngrams.get
        width = self.L
        floor = self.floor
        #
        #  A single dict.get per window replaces the membership test
        #  followed by a second lookup.
        #
        return sum(lookup(text[i:i + width], floor)
                   for i in range(len(text) - width + 1))


def _report_string_score(fitness, s):
    # Score the string s with fitness and print its length and score.
    score = fitness.score(s)
    print(' %s length = %d, score = %g' % (s, len(s), score))


def _report_file_score(fitness, f):
    # Read the file named f, uppercase its text, score it, and print the
    # file name, the text length and the score.
    with open(f, 'r') as file:
        t = file.read()
    t = t.upper()
    score = fitness.score(t)
    print(' %s length = %d, score = %g' % (f, len(t), score))


def _ngram_score_demo(title, kind, ngramfile):
    # Shared body of the NGRAM_SCORE tests: print the banner for "title",
    # load the n-gram table "ngramfile" (described as "kind" in the
    # banner), then score the sample strings and the two sample files.
    import platform

    print('')
    print('%s:' % title)
    print(' Python version: %s' % (platform.python_version()))
    print(' NGRAM_SCORE tests a string or text against English ngram statistics.')
    print(' Here we do a test against English %s.' % kind)
    print('')
    print(' Apparently, you want to remove all nonalphabetic information,')
    print(' and uppercase your text. But you may wish to preserve spaces.')
    print('')

    fitness = ngram_score(ngramfile)
    #
    #  Notice that NGRAM_SCORE is affected by the case of the text,
    #  by spaces, and by punctuation.
    #
    for s in ('HELLOWORLD', 'HELLO WORLD', 'helloworld',
              'HELLO, WORLD!', 'Hello, world!'):
        _report_string_score(fitness, s)
    #
    #  Read text from the file "HELLOWORLD.txt".
    #  Oddly enough, HELLOWORLD read from a file gives a different
    #  score from HELLOWORLD entered as a string.  It seems to have
    #  a terminating character (presumably a trailing newline, which
    #  would be scored like any other character in the text).
    #
    _report_file_score(fitness, 'HELLOWORLD.txt')
    #
    #  Read text from the file "desiderata.txt".
    #
    _report_file_score(fitness, 'desiderata.txt')
    #
    #  Terminate.
    #
    print('')
    print('%s:' % title)
    print(' Normal end of execution.')
    return


def ngram_score_test1():

    #*****************************************************************************80
    #
    ## NGRAM_SCORE_TEST1 tests text against monogram statistics.
    #
    #  Discussion:
    #
    #    The monogram table is read from
    #    '../../datasets/ngrams/english_monograms.txt', and the sample
    #    texts from 'HELLOWORLD.txt' and 'desiderata.txt', all relative
    #    to the current working directory.
    #
    #  Licensing:
    #
    #    This code is distributed under the GNU LGPL license.
    #
    #  Modified:
    #
    #    15 February 2016
    #
    #  Author:
    #
    #    John Burkardt
    #
    _ngram_score_demo('NGRAM_SCORE_TEST1', 'monograms',
                      '../../datasets/ngrams/english_monograms.txt')
    return


def ngram_score_test2():

    #*****************************************************************************80
    #
    ## NGRAM_SCORE_TEST2 tests text against bigram statistics.
    #
    #  Discussion:
    #
    #    The bigram table is read from
    #    '../../datasets/ngrams/english_bigrams.txt', and the sample
    #    texts from 'HELLOWORLD.txt' and 'desiderata.txt', all relative
    #    to the current working directory.
    #
    #  Licensing:
    #
    #    This code is distributed under the GNU LGPL license.
    #
    #  Modified:
    #
    #    15 February 2016
    #
    #  Author:
    #
    #    John Burkardt
    #
    _ngram_score_demo('NGRAM_SCORE_TEST2', 'bigrams',
                      '../../datasets/ngrams/english_bigrams.txt')
    return


def timestamp():

    #*****************************************************************************80
    #
    ## TIMESTAMP prints the date as a timestamp.
    #
    #  Licensing:
    #
    #    This code is distributed under the GNU LGPL license.
    #
    #  Modified:
    #
    #    06 April 2013
    #
    #  Author:
    #
    #    John Burkardt
    #
    #  Parameters:
    #
    #    None
    #
    import time

    now = time.time()
    print(time.ctime(now))

    return None


def timestamp_test():

    #*****************************************************************************80
    #
    ## TIMESTAMP_TEST tests TIMESTAMP.
    #
    #  Licensing:
    #
    #    This code is distributed under the GNU LGPL license.
    #
    #  Modified:
    #
    #    03 December 2014
    #
    #  Author:
    #
    #    John Burkardt
    #
    #  Parameters:
    #
    #    None
    #
    import platform

    print('')
    print('TIMESTAMP_TEST:')
    print(' Python version: %s' % (platform.python_version()))
    print(' TIMESTAMP prints a timestamp of the current date and time.')
    print('')

    timestamp()
    #
    #  Terminate.
    #
    print('')
    print('TIMESTAMP_TEST:')
    print(' Normal end of execution.')
    return


if (__name__ == '__main__'):
    timestamp()
    ngram_score_test1()
    ngram_score_test2()
    timestamp()