吴菲的实验报告

代码一


  from selenium import webdriver
  from selenium.webdriver.common.by import By
  from selenium.webdriver.common.keys import Keys
  import time

  def Inputtime(start, end):
      '''
      Type a start and an end date into the page's date-range inputs.

      Uses the module-level ``browser`` driver. Both inputs are located by
      class name before anything is typed; the end date is followed by
      RETURN to submit the range.

      start -- text typed into the element with class "input-left"
      end   -- text typed into the element with class "input-right"
      '''
      # Generator unpacking keeps the lookup order: left input, then right.
      left_box, right_box = (
          browser.find_element(By.CLASS_NAME, class_name)
          for class_name in ("input-left", "input-right")
      )
      left_box.send_keys(start)
      right_box.send_keys(end + Keys.RETURN)
  def Save(filename, content):
      '''
      Write ``content`` to the file ``<filename>.html`` as UTF-8 text.

      filename -- path without extension; ".html" is appended
      content  -- text to write; an existing file is overwritten

      Returns None. Errors from ``open``/``write`` (OSError) propagate.
      '''
      # The context manager closes the handle even when write() raises.
      with open(filename + '.html', 'w', encoding='utf-8') as f:
          f.write(content)
  # Script: for each company short name, search the SZSE periodic-report page,
  # filter by the annual-report category and save the result table's innerHTML
  # to "<name>.html" in the current working directory.
  # NOTE(review): ' 爱迪尔' carries a leading space, so the search text and the
  # saved file name both start with a space; the second script reads the same
  # name. Confirm whether the space is intended.
  x1=['奥飞娱乐','齐心集团','高乐股份','明牌珠宝','珠江钢琴','金一文化','萃华珠宝',' 爱迪尔','实丰文化','英派斯','海伦钢琴','德艺文创','金陵体育','创源股份','金马游乐','华立科技']
  browser = webdriver.Edge()
  browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')  # open the Shenzhen Stock Exchange page
  time.sleep(5)  # fixed wait, presumably for the page to finish loading
  # Date range: from 2012-01-01 up to today's local date.
  End = time.strftime('%Y-%m-%d', time.localtime())
  Inputtime('2012-01-01',End)
  for x in x1:
      time.sleep(1)  # wait 1 second
      element = browser.find_element(By.ID, 'input_code')  # Find the search box
      element.send_keys(x+Keys.RETURN)
      # Open the category drop-down ("请选择公告类别") and pick "年度报告" (annual report).
      element=browser.find_element(By.LINK_TEXT, '请选择公告类别')
      element.click()
      element=browser.find_element(By.LINK_TEXT, '年度报告')
      element.click()
      time.sleep(2)  # wait 2 seconds
      element = browser.find_element(By.ID, 'disclosure-table')
      time.sleep(5)  # wait 5 seconds before reading the table content
      innerHTML = element.get_attribute('innerHTML')
      Save(x,innerHTML)
      # Click the "btn-clearall" control; presumably resets the search
      # filters for the next company -- TODO confirm.
      browser.find_element(By.CLASS_NAME,"btn-clearall").click()
  # NOTE(review): quit() is commented out, so the browser stays open after the loop.
  #browser.quit()

  import re
  import pandas as pd
  import requests
  import os
  class DisclosureTable():
      '''
      Parse the search-result table of the SZSE periodic-report page.

      The constructor receives the ``innerHTML`` of the result table and
      immediately splits it into raw cells (``df_txt``). ``get_data`` then
      extracts the text and links from those cells (``df_data``).

      NOTE(review): the tag patterns below assume rows of four ``<td>``
      cells: code and short name wrapped in ``<a>``, the title as an ``<a>``
      carrying ``attachpath`` and ``href`` attributes (in that order) around
      a ``<span>``, and the time in a ``<span>``. Confirm against a saved
      page.
      '''
      # DOTALL lets an element's content span several lines.
      _P_TR = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
      _P_TD = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
      _P_A = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
      _P_SPAN = re.compile(r'<span\b[^>]*>(.*?)</span>', re.DOTALL)
      _P_LINK = re.compile(
          r'<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>',
          re.DOTALL)

      def __init__(self, innerHTML):
          '''
          Store the table HTML and build ``self.df_txt`` from it.

          innerHTML -- HTML text of the result table (header row first)
          '''
          self.html = innerHTML
          self.prefix = 'https://disc.szse.cn/download'
          self.prefix_href = 'https://www.szse.cn/'
          #
          self.txt_to_df()

      def get_code(self, txt):
          '''
          Return the stripped text of the first ``<a>`` element in ``txt``.

          Raises AttributeError when ``txt`` contains no ``<a>`` element.
          '''
          return self._P_A.search(txt).group(1).strip()

      def get_time(self, txt):
          '''
          Return the stripped text of the first ``<span>`` element in ``txt``.

          Raises AttributeError when ``txt`` contains no ``<span>`` element.
          '''
          return self._P_SPAN.search(txt).group(1).strip()

      def txt_to_df(self):
          '''
          Split the table HTML into raw cell text and store it in ``self.df_txt``.

          The first ``<tr>`` is skipped as the header row. Rows with fewer
          than four cells are ignored, since they cannot supply all four
          columns.
          '''
          trs = self._P_TR.findall(self.html)

          rows = [self._P_TD.findall(tr) for tr in trs[1:]]
          tds = [cells for cells in rows if len(cells) >= 4]

          df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                             '简称': [td[1] for td in tds],
                             '公告标题': [td[2] for td in tds],
                             '公告时间': [td[3] for td in tds]})
          self.df_txt = df

      def get_link(self, txt):
          '''
          Extract the link information from one title cell.

          Returns ``[attachpath, href, title]``, each stripped of surrounding
          whitespace. Raises AttributeError when ``txt`` does not match the
          expected anchor markup.
          '''
          matchObj = self._P_LINK.search(txt)
          attachpath = matchObj.group(1).strip()
          href = matchObj.group(2).strip()
          title = matchObj.group(3).strip()
          return [attachpath, href, title]

      def get_data(self):
          '''
          Build, store in ``self.df_data`` and return the cleaned table.

          Columns: 证券代码, 简称, 公告标题, attachpath (``self.prefix`` +
          attachpath attribute), href (``self.prefix_href`` + href
          attribute), 公告时间.
          '''
          df = self.df_txt
          codes = [self.get_code(td) for td in df['证券代码']]
          short_names = [self.get_code(td) for td in df['简称']]
          ahts = [self.get_link(td) for td in df['公告标题']]
          times = [self.get_time(td) for td in df['公告时间']]
          #
          prefix = self.prefix
          # The page link uses the site prefix, not the download prefix.
          prefix_href = self.prefix_href
          df = pd.DataFrame({'证券代码': codes,
                             '简称': short_names,
                             '公告标题': [aht[2] for aht in ahts],
                             'attachpath': [prefix + aht[0] for aht in ahts],
                             'href': [prefix_href + aht[1] for aht in ahts],
                             '公告时间': times
              })
          self.df_data = df
          return df

  def Readhtml(filename):
      '''
      Return the full text of ``<filename>.html``, decoded as UTF-8.

      filename -- path without extension; ".html" is appended

      Errors from ``open``/``read`` (e.g. FileNotFoundError) propagate.
      '''
      # The context manager closes the handle even when read() raises.
      with open(filename + '.html', encoding='utf-8') as f:
          return f.read()

  def Clean(df):
      '''
      Remove rows whose title marks a summary ("摘要") or a cancelled
      filing ("取消").

      df -- DataFrame whose third column (position 2) holds the
            announcement title

      Returns a new DataFrame with the remaining rows and a fresh 0..n-1
      index; ``df`` itself is not modified.
      '''
      marker = re.compile("摘要|取消")
      # One pass, selected by position so any index labels work.
      keep = [marker.search(str(title)) is None for title in df.iloc[:, 2]]
      return df.loc[keep].reset_index(drop=True)

  def Loadpdf(df):
      '''
      Download every file listed in ``df`` into the current working directory.

      df -- DataFrame whose third column (position 2) is the announcement
            title and whose fourth column (position 3) is the download URL

      Each file is saved as ``<title>.pdf``. Characters that Windows does
      not allow in file names are replaced with "_" in the file name.
      When a title occurs more than once, only the URL of its last
      occurrence is downloaded. Returns None.
      '''
      # e.g. the '*' of an "*ST ..." title would make open() fail on Windows.
      unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
      # Columns are read by position, not by label.
      targets = dict(zip(df.iloc[:, 2], df.iloc[:, 3]))
      for title, url in targets.items():
          response = requests.get(url)
          # Close each file as soon as its content is written.
          with open(title.translate(unsafe) + ".pdf", "wb") as fo:
              fo.write(response.content)
  # Script: for each company, parse its saved result page, write the cleaned
  # table to "<name>.csv" and download the annual reports into the
  # company's own folder.
  # NOTE(review): ' 爱迪尔' carries a leading space; it must match the name
  # of the saved "<name>.html" file. Confirm whether the space is intended.
  x1=['奥飞娱乐','齐心集团','高乐股份','明牌珠宝','珠江钢琴','金一文化','萃华珠宝',' 爱迪尔','实丰文化','英派斯','海伦钢琴','德艺文创','金陵体育','创源股份','金马游乐','华立科技']
  for x in x1:  # download the SZSE annual reports
      html = Readhtml(x)
      dt = DisclosureTable(html)
      dt1 = dt.get_data()
      df = Clean(dt1)
      df.to_csv(""+x+".csv",encoding="utf-8-sig")
      os.makedirs("C:/Users/橙子小姐/.spyder-py3/nb_sz/"+x,exist_ok=True)  # folder that receives the reports
      os.chdir("C:/Users/橙子小姐/.spyder-py3/nb_sz/"+x)
      try:
          Loadpdf(df)
      finally:
          # Return to the base directory even if a download fails.
          os.chdir(r"C:\Users\橙子小姐\.spyder-py3")
      # Report completion once per company, not only after the last one.
      print(x,"公司年报已保存完毕")

结果

结果截图 结果截图

解释

代码一用于下载相关公司的年报。获取营业额、股票等信息的代码未能运行成功，因此无法进行下一步。