陆冠臻的期末报告

STEP1:爬取上市公司年报下载链接，保存为csv文件


import fitz
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Extract the air-transport companies from the industry-classification PDF.
# The resulting table (saved as 行业信息.csv) drives every later loop.
doc = fitz.open('行业分类.pdf')

# Raw string: "<digits>\n[*]<rest of line>", i.e. a code on one line followed
# by a name on the next line (an optional leading '*' is dropped from the name).
p = re.compile(r'(\d+)\n\*?(.*)')

result = []
try:
    # pages(83, 85) iterates the page range start=83, stop=85 of the document.
    for page in doc.pages(83, 85):
        result.extend(p.findall(page.get_text()))
finally:
    # Release the PDF even if text extraction fails.
    doc.close()

# De-duplicate while keeping first-occurrence order.
result = list(dict.fromkeys(result))

# Locate the entries whose code is '56' and '58'; the rows strictly between
# them are used as the companies of industry '56'.  If several entries share
# a code, the last one wins.
beg = 0
end = 0
for position, (code, _) in enumerate(result):
    if code == '56':
        beg = position
    elif code == '58':
        end = position

companies = result[beg + 1:end]

df = pd.DataFrame({'行业大类代码': result[beg][0],
                   '行业大类名称': result[beg][1],
                   '上市公司代码': [code for code, _ in companies],
                   '上市公司简称': [short_name for _, short_name in companies]})

df.to_csv('行业信息.csv')

# Scrape annual-report links for companies listed on the Shenzhen Stock Exchange (SZSE).
# Starts a Selenium-driven Edge session; the SZSE loop further down uses it and
# ends it with browser.quit().
browser = webdriver.Edge()

class DisclosureTable_sz():
    """Parse the search-result table of the SZSE periodic-report page.

    Parameters
    ----------
    innerHTML : str
        ``innerHTML`` of the result table element.  The first ``<tr>`` is
        treated as the header row and skipped.

    After construction ``df_txt`` holds the raw HTML of the four cells of
    every data row; ``get_data()`` turns those cells into clean values and
    absolute links (also stored as ``df_data``).

    NOTE(review): the tag patterns assume cells shaped like
    ``<td><a ...>code</a></td>``,
    ``<td><a attachpath="..." href="..."><span>title</span></a></td>`` and
    ``<td><span>date</span></td>`` -- confirm against the live page markup.
    """
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'

        # Text of the first <a> / <span> element inside a cell.
        p_a = re.compile(r'<a.*?>(.*?)</a>', re.DOTALL)
        p_span = re.compile(r'<span.*?>(.*?)</span>', re.DOTALL)
        self.get_code = lambda txt: p_a.search(txt).group(1).strip()
        self.get_time = lambda txt: p_span.search(txt).group(1).strip()

        self.txt_to_df()

    def txt_to_df(self):
        """Split the table HTML into rows and cells and store them in ``df_txt``."""
        html = self.html
        p = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
        trs = p.findall(html)

        p2 = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
        # Skip the header row, and any row without the four expected cells
        # (e.g. an "empty result" placeholder row).
        tds = [cells for cells in (p2.findall(tr) for tr in trs[1:])
               if len(cells) >= 4]

        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '简称': [td[1] for td in tds],
                           '公告标题': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    def get_link(self, txt):
        """Return ``[attachpath, href, title]`` parsed from a title cell.

        Raises ``AttributeError`` when the cell does not contain the
        expected ``<a attachpath=... href=...><span>`` structure.
        """
        p_txt = r'<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>'
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        attachpath = matchObj.group(1).strip()
        href       = matchObj.group(2).strip()
        title      = matchObj.group(3).strip()
        return([attachpath, href, title])

    def get_data(self):
        """Build, store (``df_data``) and return the cleaned result table.

        Columns: 证券代码, 简称, 公告标题, attachpath (download URL),
        href (announcement page URL), 公告时间.
        """
        get_code = self.get_code
        get_time = self.get_time
        get_link = self.get_link

        df = self.df_txt
        codes = [get_code(td) for td in df['证券代码']]
        short_names = [get_code(td) for td in df['简称']]
        ahts = [get_link(td) for td in df['公告标题']]
        times = [get_time(td) for td in df['公告时间']]

        prefix = self.prefix
        prefix_href = self.prefix_href
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[2] for aht in ahts],
                           'attachpath': [prefix + aht[0] for aht in ahts],
                           'href': [prefix_href + aht[1] for aht in ahts],
                           '公告时间': times
            })
        self.df_data = df
        return(df)

# Query the SZSE periodic-report page once per company and save each result
# table as '<company short name>.csv'.
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
browser.implicitly_wait(5)

try:
    # Rows 0-3 of df are searched here by the value in column 3; the SSE loop
    # later in the file takes rows 4 onward.
    for name in df.iloc[0:4, 3]:
        element = browser.find_element(By.ID, 'input_code')
        element.send_keys(name + Keys.RETURN)
        # Restrict the search to annual reports.
        browser.find_element(By.LINK_TEXT, '请选择公告类别').click()
        browser.find_element(By.LINK_TEXT, '年度报告').click()
        # Date range typed into the two date inputs.
        y_start = browser.find_element(By.CLASS_NAME, 'input-left')
        y_start.send_keys('2013' + Keys.RETURN)
        y_end = browser.find_element(By.CLASS_NAME, 'input-right')
        y_end.send_keys('2023' + Keys.RETURN)
        time.sleep(1)  # give the result table time to refresh

        element = browser.find_element(By.ID, 'disclosure-table')
        innerHTML = element.get_attribute('innerHTML')
        # Reset the filters so the next company starts from a clean form.
        browser.find_element(By.CSS_SELECTOR, ".btn-clearall").click()

        df1 = DisclosureTable_sz(innerHTML).get_data()
        # Remove asterisks only (e.g. the one in "*ST").  The titles later
        # become file names; presumably '*' must go because it is not allowed
        # in Windows file names.  Text before the asterisk is kept.
        df1['公告标题'] = [t.replace('*', '') for t in df1['公告标题']]
        df1.to_csv(name + '.csv')
finally:
    # Always end the browser session, even when a lookup fails part-way.
    browser.quit()

# Scrape annual-report links for companies listed on the Shanghai Stock Exchange (SSE)
class DisclosureTable_sh():
    """Parse the search-result table of the SSE periodic-report page.

    Parameters
    ----------
    innerHTML : str
        ``innerHTML`` of the element wrapping the result table.  The first
        ``<tr>`` is treated as the header row and skipped.

    After construction ``df_txt`` holds the raw HTML of the four cells of
    every data row; ``get_data()`` turns those cells into clean values and
    absolute links (also stored as ``df_data``).

    NOTE(review): the tag patterns assume cells shaped like
    ``<td><span>code</span></td>``, ``<td><a href="...">title</a></td>`` and
    a plain-text date cell -- confirm against the live page markup.
    """
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix_href = 'http://www.sse.com.cn/'

        # Text of the first <span> element inside a cell.
        p_span = re.compile(r'<span.*?>(.*?)</span>', re.DOTALL)
        self.get_span = lambda txt: p_span.search(txt).group(1).strip()

        self.txt_to_df()

    def txt_to_df(self):
        """Split the table HTML into rows and cells and store them in ``df_txt``."""
        html = self.html
        p = re.compile(r'<tr\b[^>]*>(.+?)</tr>', re.DOTALL)
        trs = p.findall(html)

        p2 = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
        # Skip the header row, and any row without the four expected cells
        # (e.g. an "empty result" placeholder row).
        tds = [cells for cells in (p2.findall(tr) for tr in trs[1:])
               if len(cells) >= 4]

        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '简称': [td[1] for td in tds],
                           '公告标题': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    def get_link(self, txt):
        """Return ``[href, title]`` parsed from a title cell.

        Raises ``AttributeError`` when the cell contains no
        ``<a href="...">`` element.
        """
        p_txt = r'<a.*?href="(.*?)".*?>(.*?)</a>'
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        href     = matchObj.group(1).strip()
        title    = matchObj.group(2).strip()
        return([href, title])

    def get_data(self):
        """Build, store (``df_data``) and return the cleaned result table.

        Columns: 证券代码, 简称, 公告标题, href (absolute link), 公告时间
        (the raw content of the date cell).
        """
        get_span = self.get_span
        get_link = self.get_link

        df = self.df_txt
        codes = [get_span(td) for td in df['证券代码']]
        short_names = [get_span(td) for td in df['简称']]
        ahts = [get_link(td) for td in df['公告标题']]
        times = list(df['公告时间'])

        prefix_href = self.prefix_href
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[1] for aht in ahts],
                           'href': [prefix_href + aht[0] for aht in ahts],
                           '公告时间': times
            })
        self.df_data = df
        return(df)

def check_nextpage(driver):
    """Return True when the current page contains a '下一页' (next page) link.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session to inspect.

    ``find_elements`` returns an empty list when nothing matches, so no
    exception handling is needed and unrelated failures (for example a dead
    browser session) are no longer silently reported as "no next page".
    """
    return len(driver.find_elements(By.LINK_TEXT, '下一页')) > 0

browser = webdriver.Edge()
browser.implicitly_wait(5)

try:
    # Rows 4 onward of df: column 2 is the stock code typed into the search
    # box, column 3 the short name used for the output file name.
    for code, name in zip(df.iloc[4:, 2], df.iloc[4:, 3]):
        browser.get('http://www.sse.com.cn/disclosure/listedinfo/regular/')
        time.sleep(1)
        browser.find_element(By.ID, 'inputCode').send_keys(code)
        # Open the report-type drop-down and pick "年报" (annual report).
        browser.find_element(By.CSS_SELECTOR, ".sse_outerItem:nth-child(4) .filter-option-inner-inner").click()
        browser.find_element(By.LINK_TEXT, "年报").click()
        time.sleep(1)  # give the result table time to refresh

        element = browser.find_element(By.CLASS_NAME, 'table-responsive')
        df1 = DisclosureTable_sh(element.get_attribute('innerHTML')).get_data()

        if check_nextpage(browser):
            # NOTE(review): exactly one additional page is fetched, as before;
            # results spanning more than two pages are truncated.
            browser.find_element(By.LINK_TEXT, '下一页').click()
            time.sleep(1)
            element = browser.find_element(By.CLASS_NAME, 'table-responsive')
            df2 = DisclosureTable_sh(element.get_attribute('innerHTML')).get_data()
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent.
            df1 = pd.concat([df1, df2])
        df1.reset_index(drop=True, inplace=True)

        # Remove asterisks only (e.g. the one in "*ST").  The titles later
        # become file names; presumably '*' must go because it is not allowed
        # in Windows file names.  Text before the asterisk is kept.
        df1['公告标题'] = [t.replace('*', '') for t in df1['公告标题']]
        df1.to_csv(name + '.csv')
finally:
    # Always end the browser session, even when a lookup fails part-way.
    browser.quit()

结果展示

需要分析的航空运输业上市公司

年报链接保存为csv文件

STEP2:对爬取的年报链接进行筛选，剔除一些非年度报告的链接（如摘要、专项报告等），留下近十年年度报告链接，保存为csv文件，并下载年报，存入相应文件夹


import pandas as pd
import requests
import os

# Reload the company table from 行业信息.csv; the first CSV column becomes the
# index and dtype=str keeps every column as text.
df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))

# Link filtering
def filter_links(words, df, include = True):
    """Filter rows of ``df`` by keywords found in the '公告标题' column.

    Parameters
    ----------
    words : iterable of str
        Keywords searched for as plain substrings.
    df : pandas.DataFrame
        Table with a '公告标题' (announcement title) column.
    include : bool, default True
        True  -> keep rows whose title contains at least one keyword.
        False -> keep rows whose title contains none of the keywords.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original index and all columns.  An
        empty input keeps its columns, and a missing title (NaN) is treated
        as an empty string instead of raising.
    """
    words = list(words)
    titles = df['公告标题'].fillna('').astype(str)
    # astype(bool) guarantees a boolean mask even for an empty frame, so the
    # lookup below is always row selection and never column selection.
    matched = titles.map(
        lambda title: any(word in title for word in words)
    ).astype(bool)
    df2 = df[matched if include else ~matched]
    return(df2)

# For every company: drop announcements that are not the full annual report,
# keep the first ten remaining rows and save them as '<short name>new.csv'.
non_report_words = ['摘要','意见','关于','情况','补充','说明','计划','公告']
bracket_marks = ["（","("]  # titles carrying a bracketed suffix are dropped too

for name in df['上市公司简称']:
    df1 = pd.read_csv(name + '.csv', index_col=0, dtype=str)
    df_all = filter_links(non_report_words, df1, include=False)
    df_orig = filter_links(bracket_marks, df_all, include=False)
    df_orig = df_orig.reset_index(drop=True).head(10)
    df_orig.to_csv(name + 'new.csv')

# Create one folder per company and download its filtered annual reports into it
path = os.getcwd()
j = 0  # number of reports saved so far
for name in df['上市公司简称']:
    df1 = pd.read_csv(name+'new.csv',index_col=0,dtype=(str))
    # Build the folder path portably and write into it directly instead of
    # changing the working directory.
    folder = os.path.join(path, name)
    os.makedirs(folder, exist_ok = True)
    for i in range(len(df1)):
        href = df1.iloc[i,3]     # column 3 holds the download link
        biaoti = df1.iloc[i,2]   # announcement title
        riqi = df1.iloc[i,-1]    # announcement date (last column)
        try:
            # A timeout keeps a stalled connection from hanging the run, and
            # raise_for_status stops an HTTP error page being saved as a PDF.
            r = requests.get(href, allow_redirects=True, timeout=60)
            r.raise_for_status()
        except requests.RequestException as err:
            print(name+'年报下载失败:'+str(href)+' ('+str(err)+')')
            continue
        try:
            # File name format: <title>(<date>).pdf
            with open(os.path.join(folder, biaoti+'('+riqi+')'+'.pdf'), 'wb') as f:
                f.write(r.content)
        finally:
            r.close()
        j = j+1
        print(name+'年报下载进度:'+str(i+1)+'/'+str(len(df1)))
print('总共下载完成'+str(j)+'份年报')

结果展示

筛选后的近十年年报链接，保存为csv文件

详细内容可通过以下链接查看：

下载年报，放入相应文件夹

STEP3：从上市公司年报中提取营业收入、基本每股收益数据，保存为csv文件


#提取营业收入
import fitz
import re
import pandas as pd
import os
import numpy as np


# Reload the company table; dtype=str keeps every column as text.
df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))

# Company folders are looked up relative to the current working directory.
path = os.getcwd()

# Accumulates, per company, one revenue column and one unit column
# (frames are joined side by side on their year index).
df_revenue = pd.DataFrame()

for name in df['上市公司简称']:
    # Collect every file name found under <cwd>\<company short name>.
    file_list = []
    for files in os.walk(path+'\\'+name):
        for file in files[2]:
            file_list.append(file)

    # One row per report year: [revenue text, unit text].
    df1 = pd.DataFrame(columns=[name+'主营业务收入',name+'主营业务收入单位'])

    for file in file_list:
        doc = fitz.open(path+'\\'+name+'\\'+file)
        # Characters [-15:-11] of the file path are read as a four-digit year
        # (assumes the name ends in '(YYYY-MM-DD).pdf' -- TODO confirm against
        # the download step).  One is subtracted; presumably the report
        # published in year N covers year N-1.
        year = doc.name[-15:-11]
        year = int(year)
        year = year-1
        year = str(year)

        # Concatenate the text of all pages.  After each page the whole
        # accumulated text is cleaned: U+FFFD characters are removed, spaces
        # become newlines and runs of newlines are collapsed.
        text = ''
        for page in doc:
            text += page.get_text()
            text = text.replace("�","")
            text = text.replace(" ","\n")
            text = text.replace("\n\n\n\n","\n")
            text = text.replace("\n\n\n","\n")
            text = text.replace("\n\n","\n")
        doc.close()

        # Cut out the span from a heading line containing 主要...数据 up to
        # "经营活动产生的" (second alternative: traditional characters).
        p1 = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据\D*?(?=\\n)(.*?)经营活动产生的|(?<=\\n)[\D、]?\D*?主要\D*?數據\D*?(?=\\n)(.*?)經營活動產生的',re.DOTALL)
        content = p1.search(text)
        if content != None:
            content = content.group(0)
            # Attempt 1: the first numeric line after a label line.
            # NOTE(review): [营业|營業] is a character class, so it matches any
            # single one of those characters (including '|'), not the words.
            subp = "([0-9,.%\- ]*?\n)"
            p2 = re.compile("(?<=\\n)[营业|營業](\D*?\n+)(%s)" % subp)
            lines = p2.search(content)
            if lines != None:
                lines = lines[2]
                lines = lines.split('\n')
                revenue = lines[0]
                # A value without a thousands separator is treated as suspect.
                if ',' not in revenue:
                    # Attempt 2: join up to three consecutive numeric lines
                    # (for a number that was split across lines).
                    subp = '([0-9,.%\- ]*?\n,?)'
                    p2 = re.compile("(?<=\\n)[营业|營業](\D*?\n+)(%s%s%s)" % (subp,subp,subp))
                    lines = p2.search(content)
                    if lines !=None:
                        lines = lines[2]
                        lines = lines.replace('\n','')
                        revenue = lines
                        if ',' not in revenue:
                            # Attempt 3: take the numeric token that is
                            # immediately followed by a line starting with '0'.
                            subp = '([0-9,.%\- ]*?\n?)'
                            p2 = re.compile("(?<=\\n)[营业|營業]([\D\d\n]*?)(%s)(?=\\n0)" % subp, re.DOTALL)
                            lines = p2.search(content)
                            if lines !=None:
                                lines = lines[2]
                                lines = lines.replace('\n','')
                                revenue = lines
                                if ',' not in revenue:
                                    # All three attempts failed: record NaN and ask for a manual check.
                                    print(name+year+'年年报营业收入查找可能出错，请手动检查')
                                    revenue = np.nan
                            else:
                                print(name+year+'年年报营业收入查找可能出错，请手动检查')
                                revenue = np.nan
                    else:
                        print(name+year+'年年报营业收入查找可能出错，请手动检查')
                        revenue = np.nan
                # Unit: the text following "单位" / "單位" on the same line.
                p3 = re.compile('(?<=\n)\D*?单位：?(.*?)(?=\n)|(?<=\n)單位：?(.*?)(?=\n)',re.DOTALL)
                danwei = p3.search(content)
                if danwei != None:
                    # group(1) is set by the simplified-character alternative,
                    # group(2) by the traditional-character one.
                    if danwei.group(1) !=None:
                        danwei = danwei.group(1)
                        danwei = danwei.replace(')', '')
                        # A unit without '元' is rejected as a bad match.
                        if '元' not in danwei :
                            danwei = np.nan
                            print(name+year+'年年报营业收入单位查找可能出错，请手动检查')
                    else:
                        danwei = danwei.group(2)
                        danwei = danwei.replace(')', '')
                        if '元' not in danwei :
                            danwei = np.nan
                            print(name+year+'年年报营业收入单位查找可能出错，请手动检查')
                else:
                    # No unit line in the section: default to yuan.
                    danwei = '元'
                df1.loc[year] = [revenue,danwei]
            else:
                print(name+year+'年年报营业收入查找失败')
        else:
            print(name+year+'年年报财务数据文本定位失败')

    # Order this company's rows by year, then add its two columns to the result.
    df1 = df1.sort_index()
    df_revenue = pd.concat([df_revenue,df1],axis=1)

# Manually fill in the values that could not be extracted automatically
manual_revenue = {
    '中国东航主营业务收入': '88,009,236',
    '中国东航主营业务收入单位': '千元',
}
for column, value in manual_revenue.items():
    df_revenue.loc['2013', column] = value

# Turn the unnamed year index into an index called '年份'
df_revenue = (
    df_revenue
    .reset_index(drop=False)
    .rename(columns={"index": "年份"})
    .set_index(['年份'])
)

# Save the raw values (still text, with their unit columns)
df_revenue.to_csv('航空运输业主营业务收入数据raw.csv')

# Remove the thousands separators, convert the amounts to float and normalise
# every amount to yuan.
# Columns alternate value, unit, value, unit, ...; pairing them by position
# works for any number of companies (no hard-coded column positions).
value_columns = list(df_revenue.columns[0::2])
unit_columns = list(df_revenue.columns[1::2])

# (marker, multiplier) pairs, checked in order: the "million" markers come
# first because '万' / '萬' alone would also match inside them.
unit_factors = [('百万', 1000000), ('百萬', 1000000),
                ('万', 10000), ('萬', 10000), ('千', 1000)]


def _amount_to_float(value):
    """Convert an amount such as '88,009,236' to float; keep missing values."""
    if pd.isnull(value):
        return value
    return float(str(value).replace(',', ''))


def _unit_multiplier(unit):
    """Return the factor converting an amount in ``unit`` to yuan (1 if unknown)."""
    if pd.isnull(unit):
        return 1
    for marker, factor in unit_factors:
        if marker in unit:
            return factor
    return 1


for value_column, unit_column in zip(value_columns, unit_columns):
    amounts = df_revenue[value_column].map(_amount_to_float)
    multipliers = df_revenue[unit_column].map(_unit_multiplier)
    df_revenue[value_column] = amounts * multipliers

# Every amount is now in yuan, so the unit columns are no longer needed.
df_revenue.drop(columns=unit_columns, inplace=True)

df_revenue.to_csv('航空运输业主营业务收入数据.csv')

#提取基本每股收益
import fitz
import re
import pandas as pd
import os
import numpy as np


# Reload the company table; dtype=str keeps every column as text.
df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))

# Company folders are looked up relative to the current working directory.
path = os.getcwd()

# Accumulates one basic-EPS column per company (joined on the year index).
df_eps = pd.DataFrame()

for name in df['上市公司简称']:
    # Collect every file name found under <cwd>\<company short name>.
    file_list = []
    for files in os.walk(path+'\\'+name):
        for file in files[2]:
            file_list.append(file)

    # One row per report year holding the EPS text.
    df1 = pd.DataFrame(columns=[name+'基本每股收益'])

    for file in file_list:
        doc = fitz.open(path+'\\'+name+'\\'+file)
        # Characters [-15:-11] of the file path are read as a four-digit year
        # (assumes the name ends in '(YYYY-MM-DD).pdf' -- TODO confirm against
        # the download step).  One is subtracted; presumably the report
        # published in year N covers year N-1.
        year = doc.name[-15:-11]
        year = int(year)
        year = year-1
        year = str(year)

        # Concatenate the text of all pages.  After each page the whole
        # accumulated text is cleaned: U+FFFD characters are removed, spaces
        # become newlines and runs of newlines are collapsed.
        text = ''
        for page in doc:
            text += page.get_text()
            text = text.replace("�","")
            text = text.replace(" ","\n")
            text = text.replace("\n\n\n\n","\n")
            text = text.replace("\n\n\n","\n")
            text = text.replace("\n\n","\n")
        doc.close()

        # Cut out the span from a heading line containing 主要...数据 up to the
        # "稀释每股...收益" label (second alternative: traditional characters).
        p1 = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据\D*?(?=\\n)(.*?)稀释每股\D*?收益|(?<=\\n)[\D、]?\D*?主要\D*?數據\D*?(?=\\n)(.*?)稀釋每股\D*?收益',re.DOTALL)
        content = p1.search(text)
        if content != None:
            content = content.group(0)
            # First numeric line after a line starting with "基本每股...收益".
            subp = "([0-9.%\-() ]*?\n)"
            p2 = re.compile("(?<=\\n)基本每股\D*?收益\D*?\n+(%s)" % subp)
            lines = p2.search(content)
            if lines != None:
                lines = lines[1]
                lines = lines.replace('\n', '')
                # A value written in parentheses is stored as a negative number.
                lines = lines.replace('(', '-')
                lines = lines.replace(')', '')
                eps = lines
                df1.loc[year] = [eps]
            else:
                # Fallback: re-cut the section so that it ends at
                # "经营活动产生的" and look for a line starting with
                # "每股收益...元" instead.
                p1 = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据\D*?(?=\\n)(.*?)经营活动产生的|(?<=\\n)[\D、]?\D*?主要\D*?數據\D*?(?=\\n)(.*?)經營活動產生的',re.DOTALL)
                content = p1.search(text)
                if content != None:
                    content = content.group(0)
                    subp = "([0-9.%\-() ]*?\n)"
                    p2 = re.compile("(?<=\\n)每股收益\D*?元\D*?\n+(%s)" % subp)
                    lines = p2.search(content)
                    if lines != None:
                        lines = lines[1]
                        lines = lines.replace('\n', '')
                        lines = lines.replace('(', '-')
                        lines = lines.replace(')', '')
                        eps = lines
                        df1.loc[year] = [eps]
                    else:
                        print(name+year+'年年报基本每股收益查找失败')
                else:
                    print(name+year+'年年报基本每股收益查找失败')
        else:
            print(name+year+'年年报财务数据文本定位失败')
    # Order this company's rows by year, then add its column to the result.
    df1 = df1.sort_index()
    df_eps = pd.concat([df_eps,df1],axis=1)

# Manually fill in the value that could not be extracted automatically
manual_eps = {'中国东航基本每股收益': '0.1965'}
for column, value in manual_eps.items():
    df_eps.loc['2013', column] = value

# Turn the unnamed year index into an index called '年份'
df_eps = (
    df_eps
    .reset_index(drop=False)
    .rename(columns={"index": "年份"})
    .set_index(['年份'])
)

df_eps.to_csv('航空运输业基本每股收益数据.csv')

一些需要考虑的问题

不同年报中营业收入的单位不尽相同，一般为元，部分年报采用百万元或千元等，故利用正则表达式提取营业收入时也需同时提取单位，后续再进行单位统一化处理。
部分年报在pdf中打开为简体字，但通过fitz导入后变为繁体字，故编写正则表达式时要加入繁体的版本。
部分年报格式、文本信息混乱，无法利用正则表达式提取相应数据（120份年报中共2份无法正常提取），故进行手动查找。

单位的问题

字体繁简的问题

年报格式的问题

文本顺序完全打乱，数据被拆分重新排列组合。

比如2013年中国东航年报（东方航空2013年年报.pdf）的营业收入，本来为'88,009,236'，但是导入后完整的数字被拆开并打乱顺序。

结果展示

营业收入数据提取（nan表示该年度公司未上市，没有营业收入数据）

详细内容可通过以下链接查看：

航空运输业主营业务收入数据.csv

基本每股收益数据提取（nan表示该年度公司未上市，没有基本每股收益数据）

详细内容可通过以下链接查看：

航空运输业基本每股收益数据.csv

STEP4:从上市公司年报中提取股票简称、股票代码、办公地址、公司网址的信息，并保存文件


import fitz
import re
import pandas as pd
import os
import numpy as np


# Reload the company table; dtype=str keeps every column as text.
df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))

# Company folders are looked up relative to the current working directory.
path = os.getcwd()

# Accumulates four columns per company (short name, code, address, website),
# joined side by side on the year index.
df_information = pd.DataFrame()

# --- Section locators -------------------------------------------------------
# Both cut out the text between the "公司信息" / "公司資料" heading and the
# following "主要...数据" heading.  They differ only in the heading prefix of
# the simplified-character alternative; the second is tried when the first
# finds nothing.
_info_section_first = re.compile(r'(?<=\n)\D、?\n*公司信息(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?数据\D*?(?=\n)|(?<=\n)[\D、]?\n*公司資料(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?數據\D*?(?=\n)',re.DOTALL)
_info_section_second = re.compile(r'(?<=\n)[\D、]?\n*公司信息(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?数据\D*?(?=\n)|(?<=\n)[\D、]?\n*公司資料(?=\n)(.*?)(?<=\n)[\D、]?\D*?主要\D*?數據\D*?(?=\n)',re.DOTALL)

# --- Field patterns (group 1: simplified text, group 2: traditional text) ----
_short_name_re = re.compile(r"(?<=\n)\D*?简称：?\n*(.*?\n[B]?).*?(?=\n)|(?<=\n)\D*?簡稱：?\n*(.*?\n[B]?).*?(?=\n)",re.DOTALL)
_short_name_fallback_re = re.compile(r"(?<=\n)\D*?证券交易所\n+(.*?)(?=\n)",re.DOTALL)
_code_re = re.compile(r"(?<=\n)股票代码.*?(\d+)(?=\n)|(?<=\n)\D*?代碼.*?(\d+)(?=\n)",re.DOTALL)
_code_fallback_re = re.compile(r"(?<=\n)\D*?A股.*?(\d+)(?=\n)",re.DOTALL)
_address_re = re.compile(r"(?<=\n)\D*?办公地址：?\n+(.*)(?=\n\D*?办公地址的邮政编码)|(?<=\n)\D*?辦公地址：?\n+(.*?)(?=\n)",re.DOTALL)
_website_re = re.compile(r"(?<=\n)公司网址\n+(.*?)(电?子?传真\D*?|移动应用\D*?|手机网址\D*?|移动网址\D*?|电子信箱)(?=\n)",re.DOTALL)
_website_fallback_re = re.compile(r"(?<=\n)公司网址：?\n*(.*?)(?=\n)|(?<=\n)公司網址：?\n*(.*?)(?=\n)",re.DOTALL)


def _read_report_text(doc):
    """Return the cleaned text of all pages of an open PDF document."""
    text = ''
    for page in doc:
        text += page.get_text()
        # The cleaning is applied to the whole accumulated text after every
        # page, exactly as before, so the patterns see the same input.
        text = text.replace("�","")
        text = text.replace(" ","\n")
        text = text.replace("\n\n\n\n","\n")
        text = text.replace("\n\n\n","\n")
        text = text.replace("\n\n","\n")
    return text


def _find_short_name(content, label):
    """Return the stock short name found in ``content`` (NaN if not found)."""
    lines = _short_name_re.search(content)
    if lines is None:
        print(label+'年年报股票简称查找失败')
        return np.nan
    if lines[1] is None:
        return lines[2].replace('\n', '')
    short_name = lines[1].replace('\n', '')
    if '股票' in short_name:
        # The match contains the word '股票', so it is not accepted as a
        # name; use the line after "证券交易所" instead.
        lines = _short_name_fallback_re.search(content)
        if lines is None:
            print(label+'年年报股票简称查找失败')
            return np.nan
        short_name = lines[1].replace('\n', '')
    return short_name


def _find_code(content, label):
    """Return the stock code found in ``content`` (NaN if not found)."""
    lines = _code_re.search(content)
    if lines is None:
        print(label+'年年报股票代码查找失败')
        return np.nan
    if lines[1] is None:
        return lines[2]
    code = lines[1]
    if len(code) < 6:
        # Fewer than six digits: retry with the digits on the "A股" line.
        lines = _code_fallback_re.search(content)
        if lines is None:
            print(label+'年年报股票代码查找失败')
            return np.nan
        code = lines[1]
    return code


def _find_address(content, label):
    """Return the office address found in ``content`` (NaN if not found)."""
    lines = _address_re.search(content)
    if lines is None:
        print(label+'年年报办公地址查找失败')
        return np.nan
    found = lines[1] if lines[1] is not None else lines[2]
    return found.replace('\n', '')


def _find_website(content, label):
    """Return the company website found in ``content`` (NaN if not found)."""
    lines = _website_re.search(content)
    if lines is not None:
        return lines[1].replace('\n', '')
    lines = _website_fallback_re.search(content)
    if lines is None:
        print(label+'年年报公司网址查找失败')
        return np.nan
    found = lines[1] if lines[1] is not None else lines[2]
    return found.replace('\n', '')


for name in df['上市公司简称']:
    # Collect every file name found under <cwd>/<company short name>.
    company_dir = os.path.join(path, name)
    file_list = [file for files in os.walk(company_dir) for file in files[2]]

    df1 = pd.DataFrame(columns=[name+'股票简称',name+'股票代码',name+'办公地址',name+'公司网址'])

    for file in file_list:
        doc = fitz.open(os.path.join(company_dir, file))
        # Characters [-15:-11] of the file path are read as a four-digit year
        # (assumes the name ends in '(YYYY-MM-DD).pdf' -- TODO confirm against
        # the download step); one is subtracted to label the row.
        year = str(int(doc.name[-15:-11]) - 1)
        try:
            text = _read_report_text(doc)
        finally:
            doc.close()

        label = name + year
        section = _info_section_first.search(text)
        if section is None:
            section = _info_section_second.search(text)

        if section is None:
            df1.loc[year] = [np.nan,np.nan,np.nan,np.nan]
            print(label+'年年报公司基本信息文本定位失败')
            continue

        content = section.group(0)
        # Evaluated left to right, so failure messages keep their order.
        df1.loc[year] = [_find_short_name(content, label),
                         _find_code(content, label),
                         _find_address(content, label),
                         _find_website(content, label)]

    # Order this company's rows by year, then add its columns to the result.
    df1 = df1.sort_index()
    df_information = pd.concat([df_information,df1],axis=1)

# Manually fill in the reports whose details could not be extracted automatically
manual_information = {
    '2013': {
        '中国东航股票简称': '东方航空',
        '中国东航股票代码': '600115',
        '中国东航办公地址': '上海市虹桥路2550号',
        '中国东航公司网址': 'www.ceair.com',
    },
    '2018': {
        '中国国航股票简称': '中国国航',
        '中国国航股票代码': '601111',
        '中国国航办公地址': '中国北京市顺义区空港工业区天柱路30号',
        '中国国航公司网址': 'www.airchina.com.cn',
    },
}
for report_year, fields in manual_information.items():
    for column, value in fields.items():
        df_information.loc[report_year, column] = value

# Turn the unnamed year index into an index called '年份'
df_information = (
    df_information
    .reset_index(drop=False)
    .rename(columns={"index": "年份"})
    .set_index(['年份'])
)

# Remaining gaps are labelled '未上市' (not listed) before saving
df_information.fillna(value='未上市', inplace=True)
df_information.to_csv('航空运输业上市公司基本信息.csv')

结果展示

上市公司基本信息

详细内容可通过以下链接查看：

航空运输业上市公司基本信息.csv

STEP5:绘图


import pandas as pd
import matplotlib.pyplot as plt
from pylab import mpl
# Font able to render the Chinese labels; keep the minus sign drawable with it
mpl.rcParams.update({'font.sans-serif': ['SimHei'],
                     'axes.unicode_minus': False})


# Load the tables produced by the earlier steps
df = pd.read_csv('行业信息.csv', index_col=0, dtype=str)
df_eps = pd.read_csv('航空运输业基本每股收益数据.csv', index_col=0)
df_revenue = pd.read_csv('航空运输业主营业务收入数据.csv', index_col=0)
df_information = pd.read_csv('航空运输业上市公司基本信息.csv', index_col=0, dtype=str)
df_revenue = df_revenue / 100000000  # yuan -> units of 100 million yuan (亿元)

# Rank the companies by total revenue over all years and keep the top ten
df_revenue.loc['sum'] = df_revenue.sum()
ranked = df_revenue.T.sort_values(by='sum', ascending=False, axis=0).iloc[:10]

# Column labels end with the six characters '主营业务收入'; strip them to get names
top10_list = [label[:-6] for label in ranked.index.values.tolist()]

# Back to years-as-rows, without the helper 'sum' row
df_revenue = ranked.T.iloc[:-1]

# Drop the EPS columns of the companies outside the top ten
outside_top10 = [name + '基本每股收益'
                 for name in df['上市公司简称'] if name not in top10_list]
df_eps.drop(columns=outside_top10, inplace=True)

# Use the bare company name as the column label in both tables
df_revenue.rename(columns={name + '主营业务收入': name for name in top10_list}, inplace=True)
df_eps.rename(columns={name + '基本每股收益': name for name in top10_list}, inplace=True)


# Revenue trend lines: two figures with five companies each
# (columns 0-4, then columns 5-9 of df_revenue)
x = df_revenue.index
revenue_figures = [
    (range(0, 5), "2012-2021年航空运输业上市公司主营业务收入随时间变化趋势图"),
    (range(5, 10), "2012-2021年航空运输业上市公司主营业务收入随时间变化趋势图(续)"),
]
for column_positions, figure_title in revenue_figures:
    plt.figure(figsize=(10,8))
    for position in column_positions:
        plt.plot(x, df_revenue.iloc[:, position], marker='^', markersize=8,
                 label=df_revenue.columns[position], linewidth=2.0)

    plt.xticks(range(2012,2022), fontsize=16)
    plt.xlabel("年份", fontsize=16)
    plt.yticks(fontsize=16)
    plt.ylabel("主营业务收入（亿元）", fontsize=16)
    plt.title(figure_title, fontsize=16)
    plt.legend(loc=1, prop={'size':15})
    plt.grid()



# Basic-EPS trend lines: two figures with five companies each.
# NOTE(review): the column positions are hard-coded; presumably they follow
# the revenue ranking -- confirm if the company list changes.
x = df_eps.index
eps_figures = [
    ([4, 8, 5, 6, 1], 0, "2012-2021年航空运输业上市公司基本每股收益随时间变化趋势图"),
    ([9, 7, 3, 2, 0], 1, "2012-2021年航空运输业上市公司基本每股收益随时间变化趋势图(续)"),
]
for column_positions, legend_location, figure_title in eps_figures:
    plt.figure(figsize=(10,8))
    for position in column_positions:
        plt.plot(x, df_eps.iloc[:, position], marker='s', markersize=7,
                 label=df_eps.columns[position], linewidth=2.0)

    plt.xticks(range(2012,2022), fontsize=16)
    plt.xlabel("年份", fontsize=16)
    plt.yticks(fontsize=16)
    plt.ylabel("基本每股收益（元/股）", fontsize=16)
    plt.title(figure_title, fontsize=16)
    plt.legend(loc=legend_location, prop={'size':15})
    plt.grid()




# Grouped bar charts comparing the companies: the first five rows and the
# remaining rows, for revenue and for basic EPS.
# Each entry: (data, figure size, y-axis label, title, extra legend options)
eps_column_order = [4,8,5,6,1,9,7,3,2,0]
bar_charts = [
    (df_revenue[:5], (10,8), '主营业务收入（亿元）',
     '2012-2016年航空运输业上市公司主营业务收入横向对比图', {}),
    (df_revenue[5:], (10,8), '主营业务收入（亿元）',
     '2017-2021年航空运输业上市公司主营业务收入横向对比图', {}),
    (df_eps.iloc[:5, eps_column_order], (18,9), '基本每股收益（元/股）',
     '2012-2016年航空运输业上市公司基本每股收益横向对比图', {}),
    (df_eps.iloc[5:, eps_column_order], (18,9), '基本每股收益（元/股）',
     '2017-2021年航空运输业上市公司基本每股收益横向对比图', {'ncol': 2}),
]
for chart_data, chart_size, y_label, chart_title, legend_options in bar_charts:
    chart_data.plot(kind='bar', figsize=chart_size, width=0.6)
    plt.xticks(fontsize=16, rotation=0)
    plt.xlabel('年份', fontsize=16,rotation=0)
    plt.yticks(fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.title(chart_title, fontsize=16)
    plt.legend(loc=1, prop={'size':14}, **legend_options)
    plt.grid()

结果展示

营业收入前十的上市公司营业收入随时间变化趋势图

营业收入前十的上市公司基本每股收益随时间变化趋势图

营业收入前十的上市公司营业收入横向对比图

营业收入前十的上市公司基本每股收益横向对比图

航空运输业解读与分析

纵向分析

航空运输行业在2020年之前发展稳中向好。这主要得益于我国GDP的稳步增长和居民消费水平的不断提高，乘坐航班逐渐成为长途出行的普遍选择。从所选出的行业内十家具有一定代表性的公司的主营业务收入时间序列图来看，2012年至2019年营业收入均呈现稳步上升的趋势；基本每股收益大部分保持稳定或上升，仅个别公司的基本每股收益波动比较大。

随着时间进入2020年，年初爆发新冠疫情，并持续了较长一段时间，我国实行了严格的交通管制、出入境管制，国际航线和国内航线大部分停运，给航空运输业造成了巨大的打击。营业收入几乎腰斩，基本每股收益也大幅下降，可以看到2020年十家上市公司基本每股收益清一色都为负数。

随着疫情得到有效控制，国际航线和国内航线逐渐恢复，2021年业内各上市公司的营业状况有所好转，营业收入和基本每股收益都有一定程度的回升，但基本每股收益大多仍为负数，上市公司大多入不敷出。虽然疫情得到了有效控制，但航空运输业要恢复到疫情之前的水平，还需要一定的时间。

横向分析

从行业横向对比来看，南方航空、中国国航、中国东航三大龙头的营业收入遥遥领先，龙头地位十分稳固。在疫情之前，南方航空和中国国航的营业收入水平都基本稳定在千亿以上，中国东航也在2017年突破千亿水平。由于这三家上市公司体量巨大，流通股份数较多，所以基本每股收益维持在较小的0.2-0.4元/股左右。从横向对比图可以看出，行业内其他公司的营业收入基本不到第一龙头南方航空的20%；海航控股（ST海航）是例外，其营业收入水平基本在中国东航的30%-50%左右，有成为第四大龙头的态势。

总的来说，航空运输业行业内部的竞争结构还是比较稳定的，基本上以南方航空、中国国航、中国东航三大龙头为主导，其他公司仅占较小的市场份额。