python入门,2-目录文件列举和Beautiful Soup简单解析

功能:

  1.列举一个目录下的文件

  2.利用BeautifulSoup简单解析正文内容,然后保存

待完善:

  1.多线程支持

  2.适配器支持(for雷锋网和36氪两个网站网页)

"""
parser
    for parsing html file from leiphone.com and 36kr.com
    contact xiaoyang
"""

#
# @author:  xiaoyang
# @contact: hityixiaoyang@gmail.com
# @version:
# @describe: parse an HTML file from leiphone.com
# @log:
#           1.2012-11-22 create
#           2.2012-11-23 add FileCollect and ParseTask class
#

import sys
import urllib2
import codecs
import os
from bs4 import BeautifulSoup

# Module-level output settings (not referenced by the code visible in this file,
# but reachable from the usage text via ``__doc__ % globals()``).
OUT_FILE_PREFIX = "out"
OUT_CNT = 0

# Debug switches read by the driver at the bottom of the file:
# FileCollectDBG is checked first, ParseTaskDbg only when it is False.
FileCollectDBG = False
ParseTaskDbg = True

def errPrint(code, msg=''):
    """Print the module usage text and an optional message to stderr, then exit.

    Args:
        code: Exit status handed to ``sys.exit``.
        msg: Optional detail (a string or an exception object) printed on its
            own line after the usage text. Skipped when falsy.

    Raises:
        SystemExit: Always, carrying ``code``.
    """
    # The module docstring doubles as the usage text; interpolating it against
    # globals() lets it reference module constants by name.  A module without
    # a docstring has __doc__ set to None, and ``None % dict`` raises
    # TypeError, so the usage text is skipped in that case.
    if __doc__ is not None:
        sys.stderr.write("%s\n" % (__doc__ % globals(),))
    if msg:
        # Wrap in a 1-tuple so a tuple-valued msg is not unpacked by '%'.
        sys.stderr.write("%s\n" % (msg,))
    # Exit whether or not a message was supplied: an error helper that returns
    # silently would let the caller carry on after a failure.
    sys.exit(code)

# for LeiPhone.com
def SaveResLP(doc, filename):
    """Write ``doc`` to ``filename``, bracketed by lock/unlock markers on stdout.

    Args:
        doc: Text to write; passed unchanged to the file's ``write``.
        filename: Path of the output file, opened in "w" mode (truncates).

    Returns:
        True once the write has completed.

    On IOError the failure is reported through ``errPrint(1, ...)``.
    """
    # Single-argument parenthesised print produces the same output as the
    # print statement on Python 2 and also parses on Python 3.
    print("!LOCK!")
    try:
        # The with-block closes the file only if open() succeeded; calling
        # close() on a never-opened handle (None) would raise AttributeError
        # and hide the real I/O error.
        with open(filename, "w") as fp:
            fp.write(doc)
    except IOError as errStr:
        errPrint(1, errStr)
    finally:
        print("!UNLOCK!")
    return True

# for 36kr.com
def SaveRes36K(doc, filename):
    """Placeholder saver: print the lock/unlock markers and return True.

    Both ``doc`` and ``filename`` are accepted but unused; nothing is written.
    """
    for marker in ("!LOCK!", "!UNLOCK!"):
        print(marker)
    return True

class FileCollect:
    """Collect the directory names and file paths found under a root directory.

    The constructor only stores the root; call ``init()`` to fill the lists.
    """

    def __init__(self, root):
        """Remember ``root`` and start with empty result lists."""
        self.root = root
        # Bare directory names (not full paths), in os.walk order.
        self.dlist = []
        # File paths, each joined onto the directory os.walk found it in.
        self.flist = []

    def init(self):
        """Walk ``self.root`` recursively and record its contents.

        Results are appended to ``dlist`` and ``flist``, so calling this
        more than once on the same instance duplicates entries.

        Returns:
            True, always.
        """
        for root, dirs, files in os.walk(self.root):
            self.dlist.extend(dirs)
            # os.walk yields directory paths without a trailing separator, so
            # plain string concatenation would fuse the directory and file
            # names together; os.path.join inserts the separator.
            self.flist.extend(os.path.join(root, afile) for afile in files)
        return True

class ParseTask:
    """Parse HTML files with BeautifulSoup and save the result to one output file."""

    def __init__(self, savedFileName):
        """Store the output path and reset the parse state.

        Args:
            savedFileName: Path handed to ``SaveResLP`` when a result is saved.
        """
        # Tree produced by the most recent parse() call.
        self.soup = None
        # Number of parsed files required before a save is triggered; with 0
        # every parse() call saves.
        self.savedCnt = 0
        # Files parsed since the last save.
        self.doneCnt = 0
        self.savedFileName = savedFileName

    def parse(self, readFileName):
        """Parse ``readFileName`` and save its first tag via ``SaveResLP``.

        Args:
            readFileName: Path of the HTML file to read.

        A failure to open or read the file is reported through
        ``errPrint(1, ...)``.  A document that contains no tags at all is
        parsed but nothing is saved for it.
        """
        try:
            with open(readFileName, "r") as fp:
                markup = fp.read()
        except IOError as errStr:
            errPrint(1, errStr)
            # Nothing was read, so there is nothing to parse.
            return

        # NOTE(review): no parser is named here, so bs4 chooses among the
        # parsers installed on the machine - consider pinning one.
        self.soup = BeautifulSoup(markup)
        content = self.soup.find_all()
        self.doneCnt += 1

        if self.doneCnt >= self.savedCnt:
            # find_all() returns an empty list for a document with no tags;
            # indexing it unguarded would raise IndexError.
            if content:
                SaveResLP(str(content[0]), self.savedFileName)
            self.doneCnt = 0

# Debug driver, run at import time: exercise exactly one of the two helpers,
# with FileCollectDBG taking precedence over ParseTaskDbg.
if FileCollectDBG:
    fc = FileCollect("/opt/project/")
    fc.init()
    # Each label ends in "\r\n", so the list starts on its own line.
    print("dlist:\r\n%s" % (fc.dlist,))
    print("flist:\r\n%s" % (fc.flist,))
elif ParseTaskDbg:
    newTask = ParseTask("out.html")
    newTask.parse("1119-vv-dolby.html")
    print("saved OK!\r\n")