赵嘉明的作业二

代码


  # Method 1: define helper functions that extract the text with regular expressions.
  def replaceCharEntity(htmlstr):
      """Replace common HTML character entities with the characters they denote.

      Entities listed in ``CHAR_ENTITIES`` (by name or by decimal code) are
      replaced with the mapped character; any other ``&...;`` sequence is
      replaced with the empty string. Add entries to ``CHAR_ENTITIES`` to
      handle more entities.

      The string is processed in a single pass, so text produced by one
      replacement is never scanned again (``&amp;lt;`` becomes ``&lt;``,
      not ``<``).

      :param htmlstr: HTML string.
      :return: ``htmlstr`` with the entities replaced.
      """
      CHAR_ENTITIES = {
          'nbsp': ' ', '160': ' ',
          'lt': '<', '60': '<',
          'gt': '>', '62': '>',
          'amp': '&', '38': '&',
          'quot': '"', '34': '"',
      }

      # The named group captures the entity without its "&", optional "#" and ";".
      re_charEntity = re.compile(r'&#?(?P<name>\w+);')

      def _decode(match):
          # A callback (rather than a replacement string) keeps the mapped
          # value from being interpreted as a regex replacement template.
          return CHAR_ENTITIES.get(match.group('name'), '')

      return re_charEntity.sub(_decode, htmlstr)

  def repalce(s,re_exp,repl_string):
      """Return ``s`` with every match of ``re_exp`` replaced by ``repl_string``.

      :param s: string to process.
      :param re_exp: object providing ``sub(repl, string)``, e.g. a compiled pattern.
      :param repl_string: replacement passed to ``re_exp.sub``.
      """
      substitute = re_exp.sub
      result = substitute(repl_string, s)
      return result
  def filter_tags(htmlstr):
      """Strip markup from an HTML string and return the remaining text.

      Removed, in order: ``//<![CDATA[ ... //]]>`` sections, ``<script>`` and
      ``<style>`` elements together with their content, ``<br>`` tags, all
      other tags, HTML comments, and every newline, tab, carriage return and
      space character. Character entities are replaced last (via
      ``replaceCharEntity``), so a space produced by ``&nbsp;`` is kept.

      :param htmlstr: HTML string.
      :return: the text of ``htmlstr`` without markup or whitespace.
      """
      # re.S lets ".*?" span lines, so multi-line script/style/comment bodies
      # (which may themselves contain "<" or ">") are removed as a whole.
      re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA
      re_script = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)  # script
      re_style = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)  # style
      re_br = re.compile(r'<br\s*/?>', re.I)  # line breaks
      re_h = re.compile(r'</?\w+[^>]*>')  # opening / closing HTML tags
      re_comment = re.compile(r'<!--.*?-->', re.S)  # HTML comments
      re_blank = re.compile(r'[\n\t\r ]+')  # newlines, tabs, CRs, spaces

      s = htmlstr
      for pattern in (re_cdata, re_script, re_style, re_br, re_h,
                      re_comment, re_blank):
          s = pattern.sub('', s)

      # Entities go last so that "&lt;" / "&gt;" in the text are not mistaken
      # for tag delimiters by the patterns above.
      return replaceCharEntity(s)

  # Run method 1 on the page text. `x` is not defined in this snippet --
  # presumably an HTTP response object fetched earlier (TODO confirm).
  # The return value is not assigned to anything here.
  filter_tags(x.text)

  # Method 2: simple and convenient -- get_text() drops every tag in the page
  # and returns only the text as a plain str.
  # `x` is not defined in this snippet; presumably an HTTP response object
  # fetched earlier (TODO confirm).
  # x.text is already decoded, so no from_encoding is passed (BeautifulSoup
  # ignores it for str input); the parser is named explicitly so the result
  # does not depend on which parsers happen to be installed.
  soup = BeautifulSoup(x.text, 'html.parser')
  string = soup.get_text()
  text = string.replace('\n', '')
  # str.replace returns a new string; assign it so the NBSPs are really removed.
  text = text.replace(u'\xa0', '')
  print(text)
结果

解释

方法一较为麻烦，效率也低；方法二利用 BeautifulSoup 模块，调用 get_text() 即可快捷地提取文本。需要注意的是，get_text() 会去掉所有标签、只返回纯文本字符串，因此调用 get_text() 应当是在打印、存储或者操作数据之前最后才做的事情。一般情况下，应尽可能优先尝试方法二。