陆冠臻的期末报告

STEP1:爬取上市公司年报下载链接,保存为csv文件


import fitz
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

#获取行业分类中,航空运输业的上市公司基本信息,用于后续循环操作
# Read the industry-classification PDF and build the table of air-transport
# companies; that table (saved as 行业信息.csv) drives every later loop.
doc = fitz.open('行业分类.pdf')

# Each hit is (digits, text on the following line); a leading '*' on the
# text is skipped.  Raw string: '\d' is not a valid str escape.
p = re.compile(r'(\d+)\n\*?(.*)')

result = []
for page in doc.pages(83, 85):
    result.extend(p.findall(page.get_text()))
doc.close()

# Drop repeated (code, name) pairs while keeping first-seen order.
result = list(dict.fromkeys(result))

# Entry '56' is used as the industry header row and entry '58' as the end
# marker; the rows strictly between them are the listed companies.
# NOTE(review): if either marker is missing, beg/end silently stay 0.
beg = 0
end = 0
for pos, t in enumerate(result):
    if t[0] == '56':
        beg = pos
    elif t[0] == '58':
        end = pos

companies = result[beg + 1:end]
df = pd.DataFrame({'行业大类代码': result[beg][0],
                   '行业大类名称': result[beg][1],
                   '上市公司代码': [t[0] for t in companies],
                   '上市公司简称': [t[1] for t in companies]})

df.to_csv('行业信息.csv')

#爬取深交所上市公司年报链接
# Start the Edge WebDriver session that the SZSE search loop below drives.
browser = webdriver.Edge()

class DisclosureTable_sz():
    '''
    Parse the result table of the SZSE periodic-report search page.

    The constructor receives the table's innerHTML and immediately splits it
    into raw cell markup (``df_txt``).  ``get_data`` then converts that markup
    into plain text plus absolute download / detail links (``df_data``).
    '''
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'

        # The code / short-name cells wrap their text in <a>, the time cell
        # in <span>.  Both helpers raise AttributeError when the tag is absent.
        p_a = re.compile('<a.*?>(.*?)</a>', re.DOTALL)
        p_span = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
        self.get_code = lambda txt: p_a.search(txt).group(1).strip()
        self.get_time = lambda txt: p_span.search(txt).group(1).strip()

        self.txt_to_df()

    def txt_to_df(self):
        '''Split the table HTML into rows and cells; store raw cells in df_txt.'''
        html = self.html
        p = re.compile('<tr.*?>(.*?)</tr>', re.DOTALL)
        trs = p.findall(html)

        p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
        # trs[0] is treated as the header row.  Rows with fewer than four
        # <td> cells (e.g. an empty-result placeholder) are skipped instead of
        # raising IndexError below.
        tds = [cells for cells in (p2.findall(tr) for tr in trs[1:])
               if len(cells) >= 4]

        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '简称': [td[1] for td in tds],
                           '公告标题': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    def get_link(self, txt):
        '''
        Return [attachpath, href, title] extracted from a title cell.

        Three groups are required because all three are read below; the
        attachpath attribute is expected before href inside the <a> tag.
        '''
        p_txt = '<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>'
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        attachpath = matchObj.group(1).strip()
        href       = matchObj.group(2).strip()
        title      = matchObj.group(3).strip()
        return([attachpath, href, title])

    def get_data(self):
        '''Build, store (df_data) and return the cleaned announcement table.'''
        get_code = self.get_code
        get_time = self.get_time
        get_link = self.get_link

        df = self.df_txt
        codes = [get_code(td) for td in df['证券代码']]
        short_names = [get_code(td) for td in df['简称']]
        ahts = [get_link(td) for td in df['公告标题']]
        times = [get_time(td) for td in df['公告时间']]

        prefix = self.prefix
        prefix_href = self.prefix_href
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[2] for aht in ahts],
                           'attachpath': [prefix + aht[0] for aht in ahts],
                           'href': [prefix_href + aht[1] for aht in ahts],
                           '公告时间': times
            })
        self.df_data = df
        return(df)

browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
browser.implicitly_wait(5)

# Deletes every '*' in a title together with the text in front of it
# (e.g. '*ST…' -> 'ST…').  Compiled once instead of once per company.
star_prefix = re.compile(r".*?(\*).*?")

try:
    # The first four companies in the table are searched on the SZSE site by
    # short name (column 3); the remaining rows are handled by the SSE loop.
    for name in df.iloc[0:4, 3]:
        element = browser.find_element(By.ID, 'input_code')
        element.send_keys(name + Keys.RETURN)
        # Restrict the search to annual reports published 2013-2023.
        browser.find_element(By.LINK_TEXT, '请选择公告类别').click()
        browser.find_element(By.LINK_TEXT, '年度报告').click()
        y_start = browser.find_element(By.CLASS_NAME, 'input-left')
        y_start.send_keys('2013' + Keys.RETURN)
        y_end = browser.find_element(By.CLASS_NAME, 'input-right')
        y_end.send_keys('2023' + Keys.RETURN)
        time.sleep(1)  # give the result table time to refresh

        element = browser.find_element(By.ID, 'disclosure-table')
        innerHTML = element.get_attribute('innerHTML')
        # Reset the filters so the next company starts from a clean form.
        browser.find_element(By.CSS_SELECTOR, ".btn-clearall").click()

        df1 = DisclosureTable_sz(innerHTML).get_data()
        df1['公告标题'] = [star_prefix.sub("", t) for t in df1['公告标题']]
        df1.to_csv(name + '.csv')
finally:
    # Always release the WebDriver session, even if a lookup above fails.
    browser.quit()

#爬取上交所上市公司年报链接
class DisclosureTable_sh():
    '''
    Parse the result table of the SSE periodic-report search page.

    The constructor receives the table's innerHTML and splits it into raw
    cell markup (``df_txt``); ``get_data`` converts that into plain text and
    absolute report links (``df_data``).
    '''
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix_href = 'http://www.sse.com.cn/'

        # Code and short-name cells wrap their text in <span>; the helper
        # raises AttributeError when no <span> is present.
        p_span = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
        self.get_span = lambda txt: p_span.search(txt).group(1).strip()

        self.txt_to_df()

    def txt_to_df(self):
        '''Split the table HTML into rows and cells; store raw cells in df_txt.'''
        html = self.html
        p = re.compile('<tr.*?>(.+?)</tr>', re.DOTALL)
        trs = p.findall(html)

        p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
        # trs[0] is treated as the header row.  Rows with fewer than four
        # <td> cells are skipped instead of raising IndexError below.
        tds = [cells for cells in (p2.findall(tr) for tr in trs[1:])
               if len(cells) >= 4]

        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '简称': [td[1] for td in tds],
                           '公告标题': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    def get_link(self, txt):
        '''
        Return [href, title] extracted from a title cell.

        Two groups are required because both are read below.
        '''
        p_txt = '<a.*?href="(.*?)".*?>(.*?)</a>'
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        href     = matchObj.group(1).strip()
        title    = matchObj.group(2).strip()
        return([href, title])

    def get_data(self):
        '''Build, store (df_data) and return the cleaned announcement table.'''
        get_span = self.get_span
        get_link = self.get_link

        df = self.df_txt
        codes = [get_span(td) for td in df['证券代码']]
        short_names = [get_span(td) for td in df['简称']]
        ahts = [get_link(td) for td in df['公告标题']]
        # The time cell is taken verbatim (no inner tag is stripped).
        times = [td for td in df['公告时间']]

        prefix_href = self.prefix_href
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[1] for aht in ahts],
                           'href': [prefix_href + aht[0] for aht in ahts],
                           '公告时间': times
            })
        self.df_data = df
        return(df)

def check_nextpage(driver):
    """Return True if the current page contains a '下一页' (next page) link.

    Only the "element not found" case is reported as False; any other
    WebDriver failure propagates instead of being silently swallowed.
    """
    # Local import keeps this fix self-contained; selenium is already a
    # dependency of this script.
    from selenium.common.exceptions import NoSuchElementException

    try:
        driver.find_element(By.LINK_TEXT, '下一页')
    except NoSuchElementException:
        return False
    return True

browser = webdriver.Edge()
browser.implicitly_wait(5)

# Deletes every '*' in a title together with the text in front of it.
star_prefix = re.compile(r".*?(\*).*?")

try:
    # Rows 4.. of the company table are searched on the SSE site by stock
    # code (column 2); the short name (column 3) names the output file.
    for code, name in zip(df.iloc[4:, 2], df.iloc[4:, 3]):
        browser.get('http://www.sse.com.cn/disclosure/listedinfo/regular/')
        time.sleep(1)
        element = browser.find_element(By.ID, 'inputCode')
        element.send_keys(code)
        # Open the report-type dropdown and pick "annual report".
        browser.find_element(By.CSS_SELECTOR, ".sse_outerItem:nth-child(4) .filter-option-inner-inner").click()
        browser.find_element(By.LINK_TEXT, "年报").click()
        time.sleep(1)

        element = browser.find_element(By.CLASS_NAME, 'table-responsive')
        df1 = DisclosureTable_sh(element.get_attribute('innerHTML')).get_data()

        if check_nextpage(browser):
            # Exactly one further page is fetched, as before (the former
            # `while True` always hit `break` after its first pass).
            browser.find_element(By.LINK_TEXT, '下一页').click()
            time.sleep(1)
            element = browser.find_element(By.CLASS_NAME, 'table-responsive')
            df2 = DisclosureTable_sh(element.get_attribute('innerHTML')).get_data()
            # DataFrame.append was removed in pandas 2.0; concat is equivalent.
            df1 = pd.concat([df1, df2])

        df1.reset_index(drop=True, inplace=True)
        df1['公告标题'] = [star_prefix.sub("", t) for t in df1['公告标题']]
        df1.to_csv(name + '.csv')
finally:
    # Always release the WebDriver session, even if a lookup above fails.
    browser.quit()


结果展示

需要分析的航空运输业上市公司

结果截图

年报链接保存为csv文件

结果截图 结果截图

STEP2:对爬取的年报链接进行筛选,剔除一些非年度报告的链接(如摘要、专项报告等),留下近十年年度报告链接,保存为csv文件,并下载年报,存入相应文件夹


import pandas as pd
import requests
import os

# Reload the company table written in STEP 1.  Every column is read as str,
# so values such as stock codes are not converted to numbers.
df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))

#链接筛选
def filter_links(words,df,include = True):
    """Filter announcement rows by keywords found in the '公告标题' column.

    With ``include=True`` a row is kept when its title contains at least one
    of ``words``; with ``include=False`` a row is kept only when its title
    contains none of them.  Returns the selected rows of ``df`` with their
    original index labels.
    """
    # "any word matches" for inclusion, "every word is absent" for exclusion.
    combine = any if include else all
    keep = []
    for title in df['公告标题']:
        verdicts = [(word in title) == include for word in words]
        keep.append(combine(verdicts))
    return df[keep]

# Keep only the annual reports proper: drop summaries, opinions, notices and
# similar companion documents, then drop titles carrying a parenthesised
# suffix, and keep the first ten remaining rows per company.
for name in df['上市公司简称']:
    df1 = pd.read_csv(name + '.csv', index_col=0, dtype=str)
    df_all = filter_links(['摘要','意见','关于','情况','补充','说明','计划','公告'], df1, include=False)
    # The list previously held the ASCII '(' twice.  NOTE(review): presumably
    # the second entry was the full-width '(' before the text was
    # normalised; confirm against real titles.
    df_orig = filter_links(["(", "("], df_all, include=False)
    df_orig = df_orig.reset_index(drop=True).iloc[:10]
    df_orig.to_csv(name + 'new.csv')

#建立文件夹,下载年报并放入文件夹
# Download every filtered report into a folder named after its company.
path = os.getcwd()
j = 0  # number of reports saved successfully
for name in df['上市公司简称']:
    df1 = pd.read_csv(name + 'new.csv', index_col=0, dtype=str)
    folder = os.path.join(path, name)
    os.makedirs(folder, exist_ok=True)
    total = len(df1)
    for i, row in enumerate(df1.itertuples(index=False)):
        # Positional columns: 2 = title, 3 = download link, last = date.
        href = row[3]
        biaoti = row[2]
        riqi = row[-1]
        try:
            r = requests.get(href, allow_redirects=True, timeout=60)
            # Do not save an HTTP error page under a .pdf name.
            r.raise_for_status()
        except requests.RequestException as err:
            print(name + '年报下载失败:' + str(href) + ' (' + str(err) + ')')
            continue
        # Later steps read the report year from the '(date).pdf' file-name tail.
        target = os.path.join(folder, biaoti + '(' + riqi + ')' + '.pdf')
        with open(target, 'wb') as f:
            f.write(r.content)
        r.close()
        j = j + 1
        print(name + '年报下载进度:' + str(i + 1) + '/' + str(total))
print('总共下载完成' + str(j) + '份年报')


结果展示

筛选后的近十年年报链接,保存为csv文件

结果截图

详细内容可通过以下链接查看:

  • ST海航.csv
  • 白云机场.csv
  • 春秋航空.csv
  • 东航物流.csv
  • 华夏航空.csv
  • 吉祥航空.csv
  • 南方航空.csv
  • 厦门空港.csv
  • 山航B.csv
  • 上海机场.csv
  • 深圳机场.csv
  • 中国东航.csv
  • 中国国航.csv
  • 中信海直.csv
    下载年报,放入相应文件夹

    结果截图 结果截图 结果截图

    STEP3:从上市公司年报中提取营业收入、基本每股收益数据,保存为csv文件

    
    #提取营业收入
    import fitz
    import re
    import pandas as pd
    import os
    import numpy as np
    
    
    # Reload the company table; the short names double as folder names.
    df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))
    
    path = os.getcwd()
    
    # One pair of columns (revenue, unit) per company, indexed by report year.
    df_revenue = pd.DataFrame()
    
    for name in df['上市公司简称']:
        # Collect the file names found under the company's folder.
        file_list = []
        for files in os.walk(path+'\\'+name):
            for file in files[2]:
                file_list.append(file)
    
        df1 = pd.DataFrame(columns=[name+'主营业务收入',name+'主营业务收入单位'])
    
        for file in file_list:
            doc = fitz.open(path+'\\'+name+'\\'+file)
            # Characters [-15:-11] of the path are read as a 4-digit year and
            # reduced by one.  NOTE(review): assumes names ending in
            # '(YYYY-MM-DD).pdf' as written by the download step, i.e. the
            # publication year, with the report covering the year before.
            year = doc.name[-15:-11]
            year = int(year)
            year = year-1
            year = str(year)
    
            # Concatenate all page text; spaces become newlines and runs of
            # newlines are collapsed.  NOTE(review): the replaces re-run on the
            # whole accumulated text for every page (quadratic), but moving
            # them out of the loop can change how long blank runs collapse.
            text = ''
            for page in doc:
                text += page.get_text()
                text = text.replace("�","")
                text = text.replace(" ","\n")
                text = text.replace("\n\n\n\n","\n")
                text = text.replace("\n\n\n","\n")
                text = text.replace("\n\n","\n")
            doc.close()
    
            # Locate the key-financial-data section: from a heading line
            # containing 主要…数据 up to 经营活动产生的 (Simplified first, then
            # the Traditional-character variant).
            p1 = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据\D*?(?=\\n)(.*?)经营活动产生的|(?<=\\n)[\D、]?\D*?主要\D*?數據\D*?(?=\\n)(.*?)經營活動產生的',re.DOTALL)
            content = p1.search(text)
            if content != None:
                content = content.group(0)
                # Attempt 1: the numeric line right after the revenue label.
                # NOTE(review): [营业|營業] is a character class (one of those
                # characters or '|'), not an alternation of the two words.
                subp = "([0-9,.%\- ]*?\n)"
                p2 = re.compile("(?<=\\n)[营业|營業](\D*?\n+)(%s)" % subp)
                lines = p2.search(content)
                if lines != None:
                    lines = lines[2]
                    lines = lines.split('\n')
                    revenue = lines[0]
                    # A value without a thousands separator is not accepted as
                    # the revenue figure; fall through to the next attempt.
                    if ',' not in revenue:
                        # Attempt 2: three consecutive numeric lines, joined.
                        subp = '([0-9,.%\- ]*?\n,?)'
                        p2 = re.compile("(?<=\\n)[营业|營業](\D*?\n+)(%s%s%s)" % (subp,subp,subp))
                        lines = p2.search(content)
                        if lines !=None:
                            lines = lines[2]
                            lines = lines.replace('\n','')
                            revenue = lines
                            if ',' not in revenue:
                                # Attempt 3: the numeric chunk directly before
                                # a line that starts with '0'.
                                subp = '([0-9,.%\- ]*?\n?)'
                                p2 = re.compile("(?<=\\n)[营业|營業]([\D\d\n]*?)(%s)(?=\\n0)" % subp, re.DOTALL)
                                lines = p2.search(content)
                                if lines !=None:
                                    lines = lines[2]
                                    lines = lines.replace('\n','')
                                    revenue = lines
                                    if ',' not in revenue:
                                        print(name+year+'年年报营业收入查找可能出错,请手动检查')
                                        revenue = np.nan
                                else:
                                    print(name+year+'年年报营业收入查找可能出错,请手动检查')
                                    revenue = np.nan
                        else:
                            print(name+year+'年年报营业收入查找可能出错,请手动检查')
                            revenue = np.nan
                    # Unit: text after 单位 / 單位 on its line; it must contain
                    # 元 to be accepted, and defaults to 元 when no unit line
                    # is found in the section.
                    p3 = re.compile('(?<=\n)\D*?单位:?(.*?)(?=\n)|(?<=\n)單位:?(.*?)(?=\n)',re.DOTALL)
                    danwei = p3.search(content)
                    if danwei != None:
                        if danwei.group(1) !=None:
                            danwei = danwei.group(1)
                            danwei = danwei.replace(')', '')
                            if '元' not in danwei :
                                danwei = np.nan
                                print(name+year+'年年报营业收入单位查找可能出错,请手动检查')
                        else:
                            danwei = danwei.group(2)
                            danwei = danwei.replace(')', '')
                            if '元' not in danwei :
                                danwei = np.nan
                                print(name+year+'年年报营业收入单位查找可能出错,请手动检查')
                    else:
                        danwei = '元'
                    df1.loc[year] = [revenue,danwei]
                else:
                    print(name+year+'年年报营业收入查找失败')
            else:
                print(name+year+'年年报财务数据文本定位失败')
    
        # Years in ascending order, then append this company's two columns.
        df1 = df1.sort_index()
        df_revenue = pd.concat([df_revenue,df1],axis=1)
    
    # Fill in by hand the figure that could not be extracted automatically.
    df_revenue.loc['2013','中国东航主营业务收入'] = '88,009,236'
    df_revenue.loc['2013','中国东航主营业务收入单位'] = '千元'

    # Name the index '年份' (year).
    df_revenue = df_revenue.reset_index(drop=False)
    df_revenue.rename(columns = {"index":"年份"}, inplace=True)
    df_revenue = df_revenue.set_index(['年份'])

    df_revenue.to_csv('航空运输业主营业务收入数据raw.csv')

    # Columns alternate: even positions hold the figure, odd positions its unit.
    n_rows = len(df_revenue)
    n_cols = len(df_revenue.columns)

    # Strip thousands separators and convert the figures to float.
    for j in range(0, n_cols, 2):
        for i in range(n_rows):
            cell = df_revenue.iloc[i, j]
            if pd.notnull(cell):
                df_revenue.iloc[i, j] = float(cell.replace(',', ''))

    # Convert every figure to 元.  Order matters: '百万' must be tested
    # before '万' because the former contains the latter.
    unit_factors = (('百万', 1000000), ('百萬', 1000000), ('万', 10000), ('千', 1000))
    for j in range(1, n_cols, 2):
        for i in range(n_rows):
            unit = df_revenue.iloc[i, j]
            if pd.notnull(unit):
                for marker, factor in unit_factors:
                    if marker in unit:
                        df_revenue.iloc[i, j-1] = df_revenue.iloc[i, j-1]*factor
                        df_revenue.iloc[i, j] = '元'
                        break

    # Drop all unit columns, however many companies there are (previously a
    # hard-coded list of 14 positions).
    df_revenue.drop(df_revenue.columns[1::2], axis=1, inplace=True)

    df_revenue.to_csv('航空运输业主营业务收入数据.csv')
    
    #提取基本每股收益
    import fitz
    import re
    import pandas as pd
    import os
    import numpy as np
    
    
    # Reload the company table; the short names double as folder names.
    df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))
    
    path = os.getcwd()
    
    # One basic-EPS column per company, indexed by report year.
    df_eps = pd.DataFrame()
    
    for name in df['上市公司简称']:
        # Collect the file names found under the company's folder.
        file_list = []
        for files in os.walk(path+'\\'+name):
            for file in files[2]:
                file_list.append(file)
    
        df1 = pd.DataFrame(columns=[name+'基本每股收益'])
    
        for file in file_list:
            doc = fitz.open(path+'\\'+name+'\\'+file)
            # Characters [-15:-11] of the path are read as a 4-digit year and
            # reduced by one.  NOTE(review): assumes names ending in
            # '(YYYY-MM-DD).pdf' as written by the download step.
            year = doc.name[-15:-11]
            year = int(year)
            year = year-1
            year = str(year)
    
            # Concatenate all page text; spaces become newlines and runs of
            # newlines are collapsed (re-applied to the accumulated text on
            # every page).
            text = ''
            for page in doc:
                text += page.get_text()
                text = text.replace("�","")
                text = text.replace(" ","\n")
                text = text.replace("\n\n\n\n","\n")
                text = text.replace("\n\n\n","\n")
                text = text.replace("\n\n","\n")
            doc.close()
    
            # Section from a heading line containing 主要…数据 up to the
            # diluted-EPS label (Simplified first, then Traditional variant).
            p1 = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据\D*?(?=\\n)(.*?)稀释每股\D*?收益|(?<=\\n)[\D、]?\D*?主要\D*?數據\D*?(?=\\n)(.*?)稀釋每股\D*?收益',re.DOTALL)
            content = p1.search(text)
            if content != None:
                content = content.group(0)
                # First numeric line after the 基本每股…收益 label.
                subp = "([0-9.%\-() ]*?\n)"
                p2 = re.compile("(?<=\\n)基本每股\D*?收益\D*?\n+(%s)" % subp)
                lines = p2.search(content)
                if lines != None:
                    lines = lines[1]
                    lines = lines.replace('\n', '')
                    # '(' is turned into '-' and ')' removed, so a
                    # parenthesised figure becomes a negative number.
                    lines = lines.replace('(', '-')
                    lines = lines.replace(')', '')
                    eps = lines
                    df1.loc[year] = [eps]
                else:
                    # Fallback: re-cut the section so it ends at
                    # 经营活动产生的 and look for a 每股收益…元 label instead.
                    p1 = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据\D*?(?=\\n)(.*?)经营活动产生的|(?<=\\n)[\D、]?\D*?主要\D*?數據\D*?(?=\\n)(.*?)經營活動產生的',re.DOTALL)
                    content = p1.search(text)
                    if content != None:
                        content = content.group(0)
                        subp = "([0-9.%\-() ]*?\n)"
                        p2 = re.compile("(?<=\\n)每股收益\D*?元\D*?\n+(%s)" % subp)
                        lines = p2.search(content)
                        if lines != None:
                            lines = lines[1]
                            lines = lines.replace('\n', '')
                            lines = lines.replace('(', '-')
                            lines = lines.replace(')', '')
                            eps = lines
                            df1.loc[year] = [eps]
                        else:
                            print(name+year+'年年报基本每股收益查找失败')
                    else:
                        print(name+year+'年年报基本每股收益查找失败')
            else:
                print(name+year+'年年报财务数据文本定位失败')
        # Years in ascending order, then append this company's column.
        df1 = df1.sort_index()
        df_eps = pd.concat([df_eps,df1],axis=1)
    
    # Fill in by hand the figure that could not be extracted automatically.
    df_eps.loc['2013','中国东航基本每股收益'] = '0.1965'
    
    # Name the index '年份' (year).
    df_eps = df_eps.reset_index(drop=False)
    df_eps.rename(columns = {"index":"年份"}, inplace=True)
    df_eps = df_eps.set_index(['年份'])
    
    df_eps.to_csv('航空运输业基本每股收益数据.csv')
    
    
    

    一些需要考虑的问题

    1. 不同年报营业收入对应的单位不尽相同,一般为元,部分年报采用百万或千元等,故利用正则表达式提取营业收入时也需提取单位,后续再进行单位统一化处理。
    2. 部分年报在pdf中打开为简体字,但通过fitz导入后变为繁体字,故编写正则表达式时要加入繁体的版本。
    3. 部分年报格式、文本信息混乱,无法利用正则表达式提取相应数据(120份年报中共2份无法正常提取),故进行手动查找。

    单位的问题

    结果截图 结果截图

    字体繁简的问题

    结果截图 结果截图

    年报格式的问题

    文本顺序完全打乱,数据被拆分重新排列组合。

    比如2013年中国东航年报(东方航空2013年年报.pdf)的营业收入,本来为'88,009,236',但是导入后完整的数字被拆开并打乱顺序。

    结果截图

    结果展示

    营业收入数据提取(nan表示该年度公司未上市,没有营业收入数据)

    结果截图

    详细内容可通过以下链接查看:

  • 航空运输业主营业务收入数据.csv
    基本每股收益数据提取(nan表示该年度公司未上市,没有基本每股收益数据)

    结果截图

    详细内容可通过以下链接查看:

  • 航空运输业基本每股收益数据.csv
    STEP4:从上市公司年报中提取股票简称、股票代码、办公地址、公司网址的信息,并保存文件

    
    import fitz
    import re
    import pandas as pd
    import os
    import numpy as np
    
    
    # Reload the company table; the short names double as folder names.
    df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))

    path = os.getcwd()

    # All patterns are compiled once, as raw strings ('\D' and '\d' are not
    # valid str escapes).  NOTE(review): the ':' after the labels is ASCII;
    # reports that use a full-width colon will not match it. Confirm.

    # "Company information" section, up to the key-financial-data heading.
    # The two locators differ only in the prefix allowed before 公司信息.
    _P_SECTION_PRIMARY = re.compile(r'(?<=\n)\D、?\n*公司信息(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?数据\D*?(?=\n)|(?<=\n)[\D、]?\n*公司資料(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?數據\D*?(?=\n)', re.DOTALL)
    _P_SECTION_FALLBACK = re.compile(r'(?<=\n)[\D、]?\n*公司信息(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?数据\D*?(?=\n)|(?<=\n)[\D、]?\n*公司資料(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?數據\D*?(?=\n)', re.DOTALL)

    _P_SHORT_NAME = re.compile(r"(?<=\n)\D*?简称:?\n*(.*?\n[B]?).*?(?=\n)|(?<=\n)\D*?簡稱:?\n*(.*?\n[B]?).*?(?=\n)", re.DOTALL)
    _P_SHORT_NAME_BY_EXCHANGE = re.compile(r"(?<=\n)\D*?证券交易所\n+(.*?)(?=\n)", re.DOTALL)
    _P_CODE = re.compile(r"(?<=\n)股票代码.*?(\d+)(?=\n)|(?<=\n)\D*?代碼.*?(\d+)(?=\n)", re.DOTALL)
    _P_CODE_A_SHARE = re.compile(r"(?<=\n)\D*?A股.*?(\d+)(?=\n)", re.DOTALL)
    _P_ADDRESS = re.compile(r"(?<=\n)\D*?办公地址:?\n+(.*)(?=\n\D*?办公地址的邮政编码)|(?<=\n)\D*?辦公地址:?\n+(.*?)(?=\n)", re.DOTALL)
    _P_WEB_BEFORE_LABEL = re.compile(r"(?<=\n)公司网址\n+(.*?)(电?子?传真\D*?|移动应用\D*?|手机网址\D*?|移动网址\D*?|电子信箱)(?=\n)", re.DOTALL)
    _P_WEB_LINE = re.compile(r"(?<=\n)公司网址:?\n*(.*?)(?=\n)|(?<=\n)公司網址:?\n*(.*?)(?=\n)", re.DOTALL)

    def _extract_company_info(content, label):
        """Return [short_name, code, address, web] parsed from ``content``.

        ``content`` is the company-information section of one report and
        ``label`` (company name + year) prefixes the failure messages.  A
        field that cannot be found is reported on stdout and returned as NaN.
        """
        # --- stock short name ---------------------------------------------
        lines = _P_SHORT_NAME.search(content)
        if lines is not None:
            if lines[1] is not None:
                short_name = lines[1].replace('\n', '')
                # If the capture still contains 股票, retry with the text on
                # the line after …证券交易所.
                if '股票' in short_name:
                    lines = _P_SHORT_NAME_BY_EXCHANGE.search(content)
                    if lines is not None:
                        short_name = lines[1].replace('\n', '')
                    else:
                        print(label+'年年报股票简称查找失败')
                        short_name = np.nan
            else:
                # Traditional-character alternative matched.
                short_name = lines[2].replace('\n', '')
        else:
            print(label+'年年报股票简称查找失败')
            short_name = np.nan

        # --- stock code ---------------------------------------------------
        lines = _P_CODE.search(content)
        if lines is not None:
            if lines[1] is not None:
                code = lines[1]
                # Fewer than 6 digits: retry with the number following 'A股'.
                if len(code) < 6:
                    lines = _P_CODE_A_SHARE.search(content)
                    if lines is not None:
                        code = lines[1]
                    else:
                        print(label+'年年报股票代码查找失败')
                        code = np.nan
            else:
                code = lines[2]
        else:
            print(label+'年年报股票代码查找失败')
            code = np.nan

        # --- office address -----------------------------------------------
        lines = _P_ADDRESS.search(content)
        if lines is not None:
            raw = lines[1] if lines[1] is not None else lines[2]
            address = raw.replace('\n', '')
        else:
            print(label+'年年报办公地址查找失败')
            address = np.nan

        # --- company website ----------------------------------------------
        # First try "label, value(s), next label"; otherwise take the rest of
        # the label's own line / the following line.
        lines = _P_WEB_BEFORE_LABEL.search(content)
        if lines is not None:
            web = lines[1].replace('\n', '')
        else:
            lines = _P_WEB_LINE.search(content)
            if lines is not None:
                raw = lines[1] if lines[1] is not None else lines[2]
                web = raw.replace('\n', '')
            else:
                print(label+'年年报公司网址查找失败')
                web = np.nan

        return [short_name, code, address, web]

    # Four columns per company, indexed by report year.
    df_information = pd.DataFrame()

    for name in df['上市公司简称']:
        company_dir = os.path.join(path, name)
        file_list = [file for files in os.walk(company_dir) for file in files[2]]

        df1 = pd.DataFrame(columns=[name+'股票简称',name+'股票代码',name+'办公地址',name+'公司网址'])

        for file in file_list:
            doc = fitz.open(os.path.join(company_dir, file))
            # Characters [-15:-11] of the path are read as a 4-digit year and
            # reduced by one.  NOTE(review): assumes names ending in
            # '(YYYY-MM-DD).pdf' as written by the download step.
            year = str(int(doc.name[-15:-11]) - 1)

            # Kept exactly as before: the replaces are applied to the whole
            # accumulated text after every page, which affects how long
            # blank runs collapse.
            text = ''
            for page in doc:
                text += page.get_text()
                text = text.replace("�","")
                text = text.replace(" ","\n")
                text = text.replace("\n\n\n\n","\n")
                text = text.replace("\n\n\n","\n")
                text = text.replace("\n\n","\n")
            doc.close()

            # Try the strict section locator first, then the looser one.
            content = None
            for locator in (_P_SECTION_PRIMARY, _P_SECTION_FALLBACK):
                found = locator.search(text)
                if found is not None:
                    content = found.group(0)
                    break

            if content is None:
                df1.loc[year] = [np.nan,np.nan,np.nan,np.nan]
                print(name+year+'年年报公司基本信息文本定位失败')
            else:
                df1.loc[year] = _extract_company_info(content, name+year)

        # Years in ascending order, then append this company's columns.
        df1 = df1.sort_index()
        df_information = pd.concat([df_information,df1],axis=1)

    # Fill in by hand the reports that could not be parsed automatically.
    df_information.loc['2013','中国东航股票简称'] = '东方航空'
    df_information.loc['2013','中国东航股票代码'] = '600115'
    df_information.loc['2013','中国东航办公地址'] = '上海市虹桥路2550号'
    df_information.loc['2013','中国东航公司网址'] = 'www.ceair.com'

    df_information.loc['2018','中国国航股票简称'] = '中国国航'
    df_information.loc['2018','中国国航股票代码'] = '601111'
    df_information.loc['2018','中国国航办公地址'] = '中国北京市顺义区空港工业区天柱路30号'
    df_information.loc['2018','中国国航公司网址'] = 'www.airchina.com.cn'

    # Name the index '年份' (year).
    df_information = df_information.reset_index(drop=False)
    df_information.rename(columns = {"index":"年份"}, inplace=True)
    df_information = df_information.set_index(['年份'])

    # Every remaining gap is labelled '未上市' (not listed).
    df_information.fillna(value='未上市', inplace=True)
    df_information.to_csv('航空运输业上市公司基本信息.csv')
    
    
    

    结果展示

    上市公司基本信息

    结果截图 结果截图 结果截图

    详细内容可通过以下链接查看:

  • 航空运输业上市公司基本信息.csv
    STEP5:绘图

    
    import pandas as pd
    import matplotlib.pyplot as plt
    from pylab import mpl
    # Use the SimHei font for labels and keep the minus sign as plain ASCII.
    mpl.rcParams.update({'font.sans-serif': ['SimHei'],
                         'axes.unicode_minus': False})


    df = pd.read_csv('行业信息.csv', index_col=0, dtype=str)
    df_eps = pd.read_csv('航空运输业基本每股收益数据.csv', index_col=0)
    df_revenue = pd.read_csv('航空运输业主营业务收入数据.csv', index_col=0)
    df_information = pd.read_csv('航空运输业上市公司基本信息.csv', index_col=0, dtype=str)

    # Scale revenue by 1e8 (the plots label it 亿元), then rank companies by
    # their total over all years and keep the ten largest.
    df_revenue = df_revenue / 100000000
    df_revenue.loc['sum'] = df_revenue.sum()
    ranked = df_revenue.T.sort_values(by='sum', ascending=False, axis=0).iloc[:10]

    # Column labels are '<company>主营业务收入'; cut the 6-character suffix.
    top10_list = [label[:-6] for label in ranked.index.values.tolist()]

    # Back to years-as-rows, without the helper 'sum' row.
    df_revenue = ranked.T.iloc[:-1]

    # Remove the EPS columns of companies outside the top ten.
    outside_top10 = [name + '基本每股收益'
                     for name in df['上市公司简称'] if name not in top10_list]
    df_eps.drop(columns=outside_top10, inplace=True)

    # Use the bare company name as the column label in both tables.
    df_revenue.rename(columns={name + '主营业务收入': name for name in top10_list},
                      inplace=True)
    df_eps.rename(columns={name + '基本每股收益': name for name in top10_list},
                  inplace=True)
    
    
    def _trend_figure(frame, positions, marker, markersize, ylabel, title, legend_loc):
        """Draw one line chart with a line per column position in ``positions``."""
        plt.figure(figsize=(10,8))
        years = frame.index
        for pos in positions:
            plt.plot(years, frame.iloc[:, pos], marker=marker, markersize=markersize,
                     label=frame.columns[pos], linewidth=2.0)
        plt.xticks(range(2012,2022), fontsize=16)
        plt.xlabel("年份", fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylabel(ylabel, fontsize=16)
        plt.title(title, fontsize=16)
        plt.legend(loc=legend_loc, prop={'size': 15})
        plt.grid()

    def _bar_figure(frame, figsize, ylabel, title, **legend_extra):
        """Draw one grouped bar chart of ``frame`` (rows = years)."""
        frame.plot(kind='bar', figsize=figsize, width=0.6)
        plt.xticks(fontsize=16, rotation=0)
        plt.xlabel('年份', fontsize=16, rotation=0)
        plt.yticks(fontsize=16)
        plt.ylabel(ylabel, fontsize=16)
        plt.title(title, fontsize=16)
        plt.legend(loc=1, prop={'size': 14}, **legend_extra)
        plt.grid()

    # Column positions of df_eps in the order used for the EPS charts.
    # NOTE(review): presumably chosen to follow the revenue ranking; confirm.
    eps_order = [4,8,5,6,1,9,7,3,2,0]

    # Revenue over time: first five companies, then the next five.
    _trend_figure(df_revenue, range(0, 5), '^', 8, "主营业务收入(亿元)",
                  "2012-2021年航空运输业上市公司主营业务收入随时间变化趋势图", 1)
    _trend_figure(df_revenue, range(5, 10), '^', 8, "主营业务收入(亿元)",
                  "2012-2021年航空运输业上市公司主营业务收入随时间变化趋势图(续)", 1)

    # Basic EPS over time, split the same way.
    _trend_figure(df_eps, eps_order[:5], 's', 7, "基本每股收益(元/股)",
                  "2012-2021年航空运输业上市公司基本每股收益随时间变化趋势图", 0)
    _trend_figure(df_eps, eps_order[5:], 's', 7, "基本每股收益(元/股)",
                  "2012-2021年航空运输业上市公司基本每股收益随时间变化趋势图(续)", 1)

    # Revenue side by side: first five rows, then the remaining rows.
    _bar_figure(df_revenue[:5], (10,8), '主营业务收入(亿元)',
                '2012-2016年航空运输业上市公司主营业务收入横向对比图')
    _bar_figure(df_revenue[5:], (10,8), '主营业务收入(亿元)',
                '2017-2021年航空运输业上市公司主营业务收入横向对比图')

    # Basic EPS side by side; the last legend is laid out in two columns.
    _bar_figure(df_eps.iloc[:5, eps_order], (18,9), '基本每股收益(元/股)',
                '2012-2016年航空运输业上市公司基本每股收益横向对比图')
    _bar_figure(df_eps.iloc[5:, eps_order], (18,9), '基本每股收益(元/股)',
                '2017-2021年航空运输业上市公司基本每股收益横向对比图', ncol=2)
    
    
    

    结果展示

    营业收入前十的上市公司营业收入随时间变化趋势图

    结果截图 结果截图

    营业收入前十的上市公司基本每股收益随时间变化趋势图

    结果截图 结果截图

    营业收入前十的上市公司营业收入横向对比图

    结果截图 结果截图

    营业收入前十的上市公司基本每股收益横向对比图

    结果截图 结果截图

    航空运输业解读与分析