直接上代码。
练习目标:
1. 使用 Python 面向对象的方法封装逻辑和表达 ;
2. 使用异常处理和日志API ;
3. 使用文件目录读写API ;
4. 使用 list, dict（字典，即 map）, tuple 三种数据结构 ;
5. lambda 、正则使用及其它。
下一篇将实现并发版本。
#-------------------------------------------------------------------------------
# Name: wordstat_serial.py
# Purpose: count word occurrences in the Java files of a given directory (serial version)
#
# Author: qin.shuq
#
# Created: 08/10/2014
# Copyright: (c) qin.shuq 2014
# Licence: <your licence>
#-------------------------------------------------------------------------------

import os
import re
import time
import logging

# Level names accepted by this script, mapped to the logging module's
# numeric levels.
LOG_LEVELS = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARN': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL,
}

# Directory scanned when the script is run without a command-line argument.
DEFAULT_DIRPATH = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"


def initlog(filename):
    """Return a logger whose records are written to *filename*.

    Each file gets its own logger, named after the file, so that records
    sent to one log do not also show up in the other. Calling this again
    with the same *filename* returns the same logger and does not attach a
    second handler.

    Args:
        filename: path of the log file to write to.

    Returns:
        A ``logging.Logger`` set to INFO level with one file handler for
        *filename*.
    """
    # NOTE: the root logger (logging.getLogger() with no name) must not be
    # used here: it is a singleton, so every file handler would be attached
    # to the same logger and each message would go to every log file.
    logger = logging.getLogger(filename)
    logPath = os.path.abspath(filename)
    alreadyAttached = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == logPath
        for h in logger.handlers
    )
    if not alreadyAttached:
        hdlr = logging.FileHandler(filename)
        formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
    logger.setLevel(LOG_LEVELS['INFO'])
    return logger


errlog = initlog("error.log")
infolog = initlog("info.log")


class WordReading(object):
    """Reads the lines of every file named in a list of paths."""

    def __init__(self, fileList):
        """Remember *fileList*, the paths that ``readFile`` will read."""
        self.fileList = fileList

    def readFileInternal(self, filename):
        """Return the lines of *filename*, or ``[]`` if it cannot be read.

        A failure is written to the error log and otherwise swallowed so
        that one unreadable file does not abort the whole run; a success is
        written to the info log.
        """
        try:
            # The with-block closes the file even if readlines() raises.
            with open(filename, 'r') as f:
                lines = f.readlines()
        except (IOError, UnicodeDecodeError) as err:
            # Report the real cause: not every failure is a missing file.
            errlog.error('failed to read file %s: %s\n', filename, err)
            return []
        infolog.info('[successful read file %s]\n', filename)
        return lines

    def readFile(self):
        """Return the lines of all files in ``fileList`` as one flat list."""
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines


class WordAnalyzing(object):
    """Counts how many times each word occurs in a list of text lines."""

    # A word is a maximal run of \w characters.
    wordRegex = re.compile(r"\w+")

    def __init__(self, allLines):
        """Remember *allLines*, the strings that ``analyze`` will scan."""
        self.allLines = allLines

    def analyze(self):
        """Return a dict mapping each word to its number of occurrences.

        Lines are scanned one at a time rather than concatenated, so the
        last word of a line that has no trailing newline (typically the
        final line of a file) is never glued to the first word of the next
        line.
        """
        result = {}
        findWords = WordAnalyzing.wordRegex.findall
        for line in self.allLines:
            for word in findWords(line):
                result[word] = result.get(word, 0) + 1
        return result


class FileObtainer(object):
    """Collects the paths of the files below a directory."""

    def __init__(self, dirpath, fileFilterFunc=None):
        """Store the search root and an optional path predicate.

        Args:
            dirpath: directory to walk recursively.
            fileFilterFunc: optional callable taking a file path; only paths
                for which it returns a true value are kept. ``None`` keeps
                every file.
        """
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        """Return a list of paths of all (accepted) files under ``dirpath``."""
        files = [
            os.path.join(path, filename)
            for path, _dirs, filenames in os.walk(self.dirpath)
            for filename in filenames
        ]
        if self.fileFilterFunc is None:
            return files
        # Build a real list: filter() is a lazy iterator on Python 3.
        return [f for f in files if self.fileFilterFunc(f)]


class PostProcessing(object):
    """Ranks and prints a word -> count mapping."""

    def __init__(self, resultMap):
        """Remember *resultMap*, a dict mapping word to count."""
        self.resultMap = resultMap

    def sortByValue(self):
        """Return ``(word, count)`` pairs sorted by count, largest first."""
        return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

    def obtainTopN(self, topN):
        """Print the *topN* most frequent words, one per line.

        Fewer lines are printed if there are fewer than *topN* words;
        nothing is printed if *topN* is zero or negative.
        """
        # max(..., 0) keeps a negative topN from being read as a
        # "drop the last N" slice.
        for word, count in self.sortByValue()[:max(topN, 0)]:
            print('%s  counts:  %s' % (word, count))


def _timed(label, func):
    """Call *func()*, print its wall-clock duration in ms, return its result."""
    starttime = time.time()
    result = func()
    endtime = time.time()
    print('%s cost:  %s ms' % (label, (endtime - starttime) * 1000))
    return result


def main(dirpath=DEFAULT_DIRPATH, topN=30):
    """Print the *topN* most frequent words in the .java files under *dirpath*.

    Runs the four stages in sequence (find files, read them, count words,
    print the ranking) and prints how long each stage took.
    """
    fileList = _timed(
        'ObtainFile',
        lambda: FileObtainer(
            dirpath, lambda f: f.endswith('.java')).findAllFilesInDir())
    allLines = _timed('WordReading', lambda: WordReading(fileList).readFile())
    resultMap = _timed(
        'WordAnalyzing', lambda: WordAnalyzing(allLines).analyze())
    _timed('PostProcessing', lambda: PostProcessing(resultMap).obtainTopN(topN))


if __name__ == "__main__":
    import sys

    # An optional first argument overrides the built-in default directory.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_DIRPATH)