python实现指定目录下批量文件的单词计数：串行版本

直接上代码。
练习目标：
1. 使用 Python 面向对象的方法封装逻辑和表达；
2. 使用异常处理和日志 API；
3. 使用文件目录读写 API；
4. 使用 list、dict（映射）、tuple 三种数据结构；
5. lambda、正则表达式的使用及其它。
下一篇将实现并发版本。
#-------------------------------------------------------------------------------

# Name:        wordstat_serial.py

# Purpose:     count word occurrences in the Java files under a given directory (serial version)

#

# Author:      qin.shuq

#

# Created:     08/10/2014

# Copyright:   (c) qin.shuq 2014

# Licence:     <your licence>

#-------------------------------------------------------------------------------

import re

import os

import time

import logging

# Level names mapped to the logging module's level constants.
# Note that the short name 'WARN' maps to logging.WARNING.
LOG_LEVELS = dict(
    DEBUG=logging.DEBUG,
    INFO=logging.INFO,
    WARN=logging.WARNING,
    ERROR=logging.ERROR,
    CRITICAL=logging.CRITICAL,
)

def initlog(filename):
    """Return a logger that writes INFO-and-above records to ``filename``.

    Each log file gets its own logger, named after the file. Calling
    ``logging.getLogger()`` with no name returns the shared root logger, so
    configuring it once per file attaches every file handler to the same
    logger and copies every record into every file.

    Args:
        filename: path of the log file handed to ``logging.FileHandler``.

    Returns:
        The configured ``logging.Logger``. Calling again with the same
        filename returns the same logger without adding a second handler.
    """
    logger = logging.getLogger(filename)

    # Logger names containing dots form a hierarchy ("info.log" is a child of
    # "info"); turning propagation off keeps records out of any ancestor's
    # handlers so each file only receives what was sent to its own logger.
    logger.propagate = False

    # getLogger() caches loggers by name, so attach the handler only once;
    # otherwise repeated calls would write each record multiple times.
    if not logger.handlers:
        hdlr = logging.FileHandler(filename)
        hdlr.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(hdlr)

    logger.setLevel(LOG_LEVELS['INFO'])
    return logger

# Module-level loggers used by WordReading: errlog receives a record when a
# file cannot be read, infolog receives one for each file read successfully.
errlog, infolog = initlog("error.log"), initlog("info.log")

class WordReading(object):
    """Read the text lines of every file in a list of file paths."""

    def __init__(self, fileList):
        """Remember the paths to read.

        Args:
            fileList: iterable of file paths.
        """
        self.fileList = fileList

    def readFileInternal(self, filename):
        """Return the lines of one file, or an empty list if it cannot be read.

        A successful read is reported to ``infolog``; an ``IOError`` is
        reported to ``errlog`` together with its cause and is not re-raised,
        so one unreadable file does not abort the whole run.

        Args:
            filename: path of the file to read.

        Returns:
            The list produced by ``readlines()`` (line endings kept), or
            ``[]`` on ``IOError``.
        """
        try:
            # The with-block closes the handle even when readlines() raises.
            with open(filename, 'r') as f:
                lines = f.readlines()
        except IOError as err:
            # IOError covers more than a missing file (permissions, path is a
            # directory, ...), so log the actual reason.
            errlog.error('failed to read file %s: %s \n' % (filename, err))
            return []
        infolog.info('[successful read file %s]\n' % filename)
        return lines

    def readFile(self):
        """Return the lines of all files in ``fileList``, concatenated in order."""
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines

class WordAnalyzing(object):
    """Count how often each word occurs in a sequence of text lines.

    A word is a maximal run of regex word characters (letters, digits and
    underscore).
    """

    # Raw string so the backslash reaches the regex engine untouched.
    wordRegex = re.compile(r"[\w]+")

    def __init__(self, allLines):
        """Remember the lines to analyze.

        Args:
            allLines: iterable of strings, one per line.
        """
        self.allLines = allLines

    def analyze(self):
        """Return a dict mapping each word to its number of occurrences.

        Lines are matched one at a time rather than joined into one string:
        joining would glue the last word of a line that lacks a trailing
        newline (typically the last line of a file) to the first word of the
        next line and count the pair as a single bogus word.

        Returns:
            ``dict`` of word -> count; empty when there are no words.
        """
        result = {}
        for line in self.allLines:
            for word in WordAnalyzing.wordRegex.findall(line):
                result[word] = result.get(word, 0) + 1
        return result

class FileObtainer(object):
    """Collect the paths of all files under a directory tree."""

    def __init__(self, dirpath, fileFilterFunc=None):
        """Remember the directory and the optional filter.

        Args:
            dirpath: root directory to walk.
            fileFilterFunc: optional predicate taking a file path; only paths
                for which it returns a true value are kept. ``None`` keeps
                every file.
        """
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        """Return a list of paths of the files found under ``dirpath``.

        The tree is traversed with ``os.walk`` and each path is built with
        ``os.path.join`` so the platform's separator is used.

        Returns:
            A ``list`` of paths (always a list, also when a filter is set),
            restricted to those accepted by ``fileFilterFunc`` if one was
            given.
        """
        files = []
        for path, _dirs, filenames in os.walk(self.dirpath):
            files.extend(os.path.join(path, filename) for filename in filenames)

        if self.fileFilterFunc is None:
            return files
        # A list comprehension instead of filter(): filter() returns a lazy
        # one-shot iterator on Python 3, unlike the unfiltered branch.
        return [f for f in files if self.fileFilterFunc(f)]

class PostProcessing(object):
    """Rank a word -> count mapping and print its most frequent entries."""

    def __init__(self, resultMap):
        """Remember the mapping to rank.

        Args:
            resultMap: dict of word -> occurrence count.
        """
        self.resultMap = resultMap

    def sortByValue(self):
        """Return the ``(word, count)`` pairs sorted by count, highest first."""
        return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

    def obtainTopN(self, topN):
        """Print the ``topN`` most frequent words, one per line.

        Each line has the form ``<word>  counts:  <count>``. Fewer lines are
        printed when the mapping holds fewer than ``topN`` words, and nothing
        is printed when ``topN`` is zero or negative.

        Args:
            topN: maximum number of entries to print.
        """
        # max(..., 0) keeps a negative topN from being read as a "drop the
        # last entries" slice; the slice itself caps topN at the list length.
        for word, count in self.sortByValue()[:max(topN, 0)]:
            # A single-argument print(...) behaves the same on Python 2 and 3.
            print('%s  counts:  %s' % (word, count))

if __name__ == "__main__":
    import sys

    def _timed(stage, action):
        """Run action(), print its wall-clock duration in ms, return its result."""
        starttime = time.time()
        outcome = action()
        endtime = time.time()
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print('%s cost:  %s ms' % (stage, (endtime - starttime) * 1000))
        return outcome

    # Directory to scan: the first command-line argument if given, otherwise
    # the original hard-coded default.
    if len(sys.argv) > 1:
        dirpath = sys.argv[1]
    else:
        dirpath = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"

    # Stage 1: collect the paths of all .java files under dirpath.
    fileList = _timed(
        'ObtainFile',
        lambda: FileObtainer(
            dirpath, lambda f: f.endswith('.java')).findAllFilesInDir())

    # Stage 2: read every file into one list of lines.
    allLines = _timed('WordReading', lambda: WordReading(fileList).readFile())

    # Stage 3: count the occurrences of each word.
    resultMap = _timed(
        'WordAnalyzing', lambda: WordAnalyzing(allLines).analyze())

    # Stage 4: print the 30 most frequent words.
    _timed('PostProcessing', lambda: PostProcessing(resultMap).obtainTopN(30))
秒客网

python实现指定目录下批量文件的单词计数：串行版本

相关文章