姓名:李怡
学号:4201489
■引入所需库
import requests
import re
import pandas as pd
import os
import fitz
import time
import openpyxl
■选择饮料行业,提取年报链接与文件名并保存为.csv文件
# Pull the quoted (link, title) pairs out of column C of the workbook and save
# them as a CSV with the columns 'link' and 'f_name'.
xlsx = '饮料行业.xlsx'
# NOTE(review): `df` and `C2` are not used anywhere inside this block.
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
# The first and the last cell of the column are skipped.
links_1 = links[1:-1]
# Empty cells are read back as None and would make ''.join raise TypeError,
# so only string values are concatenated.
links_2 = ''.join(v for v in links_1 if isinstance(v, str))
# Each match is a pair of adjacent double-quoted strings: "<link>","<title>".
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({
    'link': [t[0] for t in list_of_tuple],
    'f_name': [t[1] for t in list_of_tuple],
})
df2.to_csv('饮料行业.csv')
■用正则表达式筛选标题,标准化年报文件名
# Normalise report titles: directly after a four-digit year, the short forms
# "年报" and "年年报" are rewritten to the canonical "年年度报告".
df = pd.read_csv('饮料行业.csv')
# The previous pattern never closed its lookbehind group, so re.compile raised
# re.error. The year is now a closed, fixed-width lookbehind followed by the
# two alternatives.
p = re.compile(r'(?<=\d{4})(年年报|年报)')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names
def filter_links(words, df, include=True):
    """Select rows of *df* by keyword matches in its ``f_name`` column.

    Args:
        words: Keywords that are searched as substrings of each ``f_name``.
        df: DataFrame with an ``f_name`` column of strings.
        include: Must be a plain bool. When true, keep the rows whose name
            contains at least one keyword; when false, keep the rows whose
            name contains none of them. A non-empty list such as ``[False]``
            is truthy and therefore selects the "include" behaviour.

    Returns:
        A new DataFrame holding only the selected rows; the original index
        labels are kept.
    """
    if include:
        mask = [any(word in f for word in words) for f in df.f_name]
    else:
        mask = [all(word not in f for word in words) for f in df.f_name]
    # .loc with a boolean list filters rows and keeps every column, also when
    # the frame (and therefore the mask) is empty.
    return df.loc[mask]
# `include` has to be a plain bool: filter_links tests it with `if include`,
# and a one-element list such as [False] is truthy, so the former calls all
# behaved as include=True.
# Drop every title that contains one of the non-annual-report keywords.
df_all = filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df,include=False)
# Split by the presence of a parenthesis in the title: titles without one go
# to df_orig, titles with one go to df_updt (presumably revised versions —
# confirm against the data).
# NOTE(review): both list entries are the same character here; presumably one
# of them was meant to be the full-width parenthesis — confirm.
df_orig = filter_links(['(','('],df_all,include=False)
df_updt = filter_links(['(','(',],df_all,include=True)
# Titles containing "取消" are removed from the update list.
df_updt = filter_links(['取消'], df_updt,include=False)
■创建文件夹10companies
def sub_with_update(df_updt, df_orig):
    """Return a copy of *df_orig* whose links are replaced by updated ones.

    A row of *df_updt* counts as an update of a row of *df_orig* when the
    original ``f_name`` is a substring of the updated ``f_name``. When several
    updates match one original row, the last matching row of *df_updt* wins.

    Args:
        df_updt: DataFrame of updated reports with an ``f_name`` column.
        df_orig: DataFrame of original reports with an ``f_name`` column.
            It is not modified.

    Returns:
        A copy of *df_orig* in which the second-to-last column of each matched
        row holds the value from the matching row of *df_updt*.
    """
    df_newest = df_orig.copy()
    for i, f in enumerate(df_orig.f_name):
        for j, fn in enumerate(df_updt.f_name):
            if f in fn:
                # Write into the copy: the previous code assigned to df_orig,
                # which left the returned frame unchanged and mutated the
                # caller's input.
                # NOTE(review): column -2 is presumably the link column —
                # confirm against the frames passed in.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
# Merge the updated links into the original list, then write one CSV per
# company for the ten companies with the most rows.
df_newest = sub_with_update(df_updt, df_orig)
df_all.sort_values(by=['f_name'], inplace=True, ignore_index=True)
# The first four characters of every title are used as the company short name.
df_newest['公司简称'] = [title[:4] for title in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
# One filtered frame for each of the first ten entries of the count index.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]
if not os.path.exists('10companies'):
    os.makedirs('10companies')
for df_com in ten_company:
    # The file is named after the short name found in the frame's first row.
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
ten_csv = os.listdir('10companies')
■输出符合条件的年报
# Switch the working directory to the per-company folder and list its content.
os.chdir('C:/Users/这夏/Documents/python/10companies')
f_1 = os.listdir()
# Drop the entry at position 0, then the entry at position 4 of the shortened
# list. NOTE(review): which files these positions refer to depends on the
# listing order — confirm.
del f_1[0]
del f_1[4]
# The link and title columns come from the frame `df`.
links, f_names = df['link'], df['f_name']
def get_PDF_url(url):
    """Fetch the page at *url* and return the first ``<a>`` link found in it.

    Args:
        url: Address of the page to download.

    Returns:
        ``(href, fname)`` — the link target prefixed with the first 26
        characters of the final response URL, and the stripped link text.
        An empty tuple ``()`` when the page contains no matching ``<a>`` tag;
        a warning is issued in that case.
    """
    import warnings

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
    finally:
        # The HTML has been read (or reading failed): release the connection.
        r.close()
    # Only the first <a> tag is used, hence search() instead of findall().
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)
    if a is None:
        # Warning(...) merely constructed an exception object and discarded
        # it; warnings.warn actually reports the problem.
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()
    href = a.group(1)
    fname = a.group(2).strip()
    # NOTE(review): the 26-character prefix is presumably the site root of the
    # target host — confirm.
    href = r.url[:26] + href
    return (href, fname)
# Resolve every announcement page to its PDF link and title, then save the
# result as a CSV.
hrefs = []
fnames = []
for link in links:
    result = get_PDF_url(link)
    # Wait ten seconds after every request, whether it succeeded or not.
    time.sleep(10)
    if not result:
        # get_PDF_url returns an empty tuple when no link was found;
        # unpacking it would raise ValueError, so the page is skipped.
        continue
    href, fname = result
    hrefs.append(href)
    fnames.append(fname)
df_final_links = pd.DataFrame({'href': hrefs, 'fname': fnames})
df_final_links.to_csv('饮料links.csv')
# Download every PDF listed in the link table into the working directory.
# NOTE(review): this path says '11companies' while the other paths in this
# file use '10companies' — confirm.
df_final_links=pd.read_csv('C:/Users/这夏/Documents/python/11companies/饮料links.csv')
f_names=df_final_links['fname']
hrefs=df_final_links['href']
for href, f_name in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    try:
        # The with-block closes the output file; the former
        # open(...).write(...) never closed it explicitly.
        with open('%s' % f_name, 'wb') as fh:
            fh.write(r.content)
    finally:
        r.close()
    # Wait ten seconds between downloads.
    time.sleep(10)
■提取所有年报指定页的表格
import pdfplumber
# Open the report in a with-block so the PDF file is closed once the table
# has been extracted (it was previously left open).
with pdfplumber.open("C:/Users/这夏/Documents/python/10companies/承德露露:2017年年度报告.pdf") as pdf:
    first_page = pdf.pages[6]  # index 6, i.e. the seventh page
    table = first_page.extract_table()
table
[['', '2017年', '2016年', '本年比上年增减', '2015年'], ['营业收入(元)', '2,111,873,347.17', '2,520,897,586.14', '-16.23%', '2,706,238,122.26'], ['归属于上市公司股东的净利润(元)', '413,597,862.00', '450,367,055.47', '-8.16%', '463,234,690.64'], ['归属于上市公司股东的扣除非经常性', '', '', '', ''], [None, '413,856,259.15', '453,047,262.55', '-8.65%', '456,025,838.47'], ['损益的净利润(元)', None, None, None, None], [None, '', '', '', ''], ['经营活动产生的现金流量净额(元)', '148,785,503.78', '831,482,030.70', '-82.11%', '795,629,027.23'], ['基本每股收益(元/股)', '0.42', '0.46', '-8.70%', '0.47'], ['稀释每股收益(元/股)', '0.42', '0.46', '-8.70%', '0.47'], ['加权平均净资产收益率', '21.52%', '24.83%', '减少3.31百分点', '31.16%'], ['', '2017年末', '2016年末', '本年末比上年末增减', '2015年末'], ['总资产(元)', '2,811,642,070.79', '3,096,330,515.89', '-9.19%', '2,491,426,927.93'], ['归属于上市公司股东的净资产(元)', '1,998,366,943.71', '1,976,194,172.91', '1.12%', '1,676,375,229.44']]
import pdfplumber
# Open the report in a with-block so the PDF file is closed once the table
# has been extracted (it was previously left open).
with pdfplumber.open("C:/Users/这夏/Documents/python/10companies/承德露露:2018年年度报告.pdf") as pdf:
    first_page = pdf.pages[6]  # index 6, i.e. the seventh page
    table = first_page.extract_table()
table
[['', '2018年', '2017年', '本年比上年增减', '2016年'], ['营业收入(元)', '2,121,966,609.34', '2,111,873,347.17', '0.48%', '2,520,897,586.14'], ['归属于上市公司股东的净利润(元)', '413,057,313.39', '413,597,862.00', '-0.13%', '450,367,055.47'], ['归属于上市公司股东的扣除非经常性损', '', '', '', ''], [None, '413,301,063.71', '413,856,259.15', '-0.13%', '453,047,262.55'], ['益的净利润(元)', None, None, None, None], [None, '', '', '', ''], ['经营活动产生的现金流量净额(元)', '522,954,226.30', '148,785,503.78', '251.48%', '831,482,030.70'], ['基本每股收益(元/股)', '0.42', '0.42', '0.00%', '0.46'], ['稀释每股收益(元/股)', '0.42', '0.42', '0.00%', '0.46'], ['加权平均净资产收益率', '21.99%', '21.52%', '0.47%', '24.83%'], ['', '2018年末', '2017年末', '本年末比上年末增减', '2016年末'], ['总资产(元)', '2,842,852,497.72', '2,811,642,070.79', '1.11%', '3,096,330,515.89'], ['归属于上市公司股东的净资产(元)', '1,922,142,893.10', '1,998,366,943.71', '-3.81%', '1,976,194,172.91']]
import pdfplumber
# Open the report in a with-block so the PDF file is closed once the table
# has been extracted (it was previously left open).
with pdfplumber.open("C:/Users/这夏/Documents/python/10companies/承德露露:2019年年度报告.pdf") as pdf:
    first_page = pdf.pages[6]  # index 6, i.e. the seventh page
    table = first_page.extract_table()
table
[['', '2019年', '2018年', '本年比上年增减', '2017年'], ['营业收入(元)', '2,255,394,058.97', '2,121,966,609.34', '6.29%', '2,111,873,347.17'], ['归属于上市公司股东的净利润(元)', '464,868,495.52', '413,057,313.39', '12.54%', '413,597,862.00'], ['归属于上市公司股东的扣除非经常\n性损益的净利润(元)', '465,329,771.17', '413,301,063.71', '12.59%', '413,856,259.15'], ['经营活动产生的现金流量净额(元)', '675,423,689.39', '522,954,226.30', '29.16%', '148,785,503.78'], ['基本每股收益(元/股)', '0.47', '0.42', '11.90%', '0.42'], ['稀释每股收益(元/股)', '0.47', '0.42', '11.90%', '0.42'], ['加权平均净资产收益率', '24.55%', '21.99%', '增加2.56个百分点', '21.52%'], ['', '2019年末', '2018年末', '本年末比上年末增减', '2017年末'], ['总资产(元)', '3,105,501,655.25', '2,842,852,497.72', '9.24%', '2,811,642,070.79'], ['归属于上市公司股东的净资产(元)', '1,995,586,297.42', '1,922,142,893.10', '3.82%', '1,998,366,943.71']]
import pdfplumber
# Open the report in a with-block so the PDF file is closed once the table
# has been extracted (it was previously left open).
with pdfplumber.open("C:/Users/这夏/Documents/python/10companies/承德露露:2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[6]  # index 6, i.e. the seventh page
    table = first_page.extract_table()
table
[['', '2020年', '2019年', '本年比上年增减', '2018年'], ['营业收入(元)', '1,860,643,698.75', '2,255,394,058.97', '-17.50%', '2,121,966,609.34'], ['归属于上市公司股东的净利润(元)', '432,188,575.47', '464,868,495.52', '-7.03%', '413,057,313.39'], ['归属于上市公司股东的扣除非经常\n性损益的净利润(元)', '429,639,051.19', '465,329,771.17', '-7.67%', '413,301,063.71'], ['经营活动产生的现金流量净额(元)', '378,550,364.75', '675,423,689.39', '-43.95%', '522,954,226.30'], ['基本每股收益(元/股)', '0.40', '0.43', '-6.98%', '0.42'], ['稀释每股收益(元/股)', '0.40', '0.43', '-6.98%', '0.42'], ['加权平均净资产收益率', '20.90%', '24.55%', '-3.65%', '21.99%'], ['', '2020年末', '2019年末', '本年末比上年末增减', '2018年末'], ['总资产(元)', '3,092,933,440.81', '3,105,501,655.25', '-0.40%', '2,842,852,497.72'], ['归属于上市公司股东的净资产(元)', '2,201,387,767.44', '1,995,586,297.42', '10.31%', '1,922,142,893.10']]
■例1:以承德露露公司为例,选取营业收入这一指标,画出2015-2020年度统计图
import numpy as np
import matplotlib.pyplot as plt
# Global matplotlib settings: SimHei as sans-serif font, ASCII minus sign on
# the axes, default font size 16.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
    'font.size': 16,
})

# Figure 1: dashed red line of the yearly revenue figures.
x = np.array([2015, 2016, 2017, 2018, 2019, 2020])
y = np.array([
    2706238122.26, 2520897586.14, 2111873347.17,
    2121966609.34, 2255394058.97, 1860643698.75,
])
plt.plot(x, y, color='red', linestyle='--', label='y=x^2')
plt.xlabel('年份', fontsize=14)
plt.ylabel('营业收入(十亿元)', fontsize=14)
plt.title('2015-2020年度公司营业收入', fontsize=18)
plt.show()

# Figure 2: the same figures as a bar chart with a clipped y-range.
data = [
    2706238122.26, 2520897586.14, 2111873347.17,
    2121966609.34, 2255394058.97, 1860643698.75,
]
sign = np.arange(2015, 2021)
plt.bar(sign, height=data)
plt.ylim(1500000000, 2900000000)
plt.xlabel('年份', fontsize=14)
plt.ylabel('营业收入(十亿元)', fontsize=14)
plt.title('2015-2020年度公司营业收入', fontsize=18)
Text(0.5, 1.0, '2015-2020年度公司营业收入')
■例2:输出古越龙山与海南椰岛2008-2018年总资产柱状图
# Grouped bar chart: one bar pair per year from 2008 to 2018.
position = np.arange(2008, 2019)
data1 = [
    1831208365.67, 2371715527.33, 2829324821.35, 3094070129.54,
    3586446377.00, 3588638221.04, 4327721680.42, 4405694773.80,
    4567329385.52, 4643168284.08, 4802066501.49,
]
data2 = [
    1034148372.68, 1332730651.20, 1298352365.45, 1397549026.55,
    1669325931.87, 1423635020.15, 1416827999.83, 1331336773.68,
    1741092523.07, 1928741261.16, 0,
]
w = 0.3
# The second series is shifted right by one bar width so the bars sit side
# by side.
plt.bar(x=position, height=data1, width=w, label='古越龙山')
plt.bar(x=position + w, height=data2, width=w, label='海南椰岛')
plt.xlabel('年份', fontsize=14)
plt.ylabel('总资产(十亿元)', fontsize=14)
plt.title('2008-2018年度公司总资产', fontsize=18)
plt.legend()
<matplotlib.legend.Legend at 0x273729136d0>
■例3:输出莫高股份与张裕A 2007-2017年度总资产柱状图
# Grouped bar chart: one bar pair per year from 2007 to 2017.
position = np.arange(2007, 2018)
data1 = [
    809742022.64, 1141422910.86, 1112113251.14, 1149676535.40,
    1216397893.37, 1288750686.27, 1219007908.45, 1263709450.93,
    1276029745.87, 1310386191.42, 1307448185.08,
]
data2 = [
    3251224474, 4060932580, 5364160798, 5983377253,
    7295944221, 8123134580, 7997930542, 8912232640,
    10344211461, 11528077971, 12536755208,
]
w = 0.3
# The second series is shifted right by one bar width so the bars sit side
# by side.
plt.bar(x=position, height=data1, width=w, label='莫高股份')
plt.bar(x=position + w, height=data2, width=w, label='张裕A')
plt.xlabel('年份', fontsize=14)
plt.ylabel('总资产(十亿元)', fontsize=14)
plt.title('2007-2017年度公司总资产', fontsize=18)
plt.legend()
<matplotlib.legend.Legend at 0x27374eb04f0>
■例4:输出深深宝A与西藏发展2007-2017年度总资产柱状图
# Grouped bar chart: one bar pair per year from 2007 to 2017.
position = np.arange(2007, 2018)
data1 = [
    468074812.84, 531942477.80, 623356758.00, 658468839.95,
    1061964234.64, 1222994595.75, 1126831157.86, 1154612267.59,
    1060458757.46, 1178543725.30, 1070386220.55,
]
data2 = [
    934183497.06, 1075228609.68, 1165078085.37, 1189791897.51,
    1116924111.74, 1149008697.24, 1354405152.39, 1378077552.04,
    1423835915.52, 1401078354.10, 1469337293.77,
]
w = 0.3
# The second series is shifted right by one bar width so the bars sit side
# by side.
plt.bar(x=position, height=data1, width=w, label='深深宝A')
plt.bar(x=position + w, height=data2, width=w, label='西藏发展')
plt.xlabel('年份', fontsize=14)
plt.ylabel('总资产(十亿元)', fontsize=14)
plt.title('2007-2017年度公司总资产', fontsize=18)
plt.legend()
<matplotlib.legend.Legend at 0x27374f52610>
■以伊利实业集团为例,提取指定页面的表格,其他公司亦同
import pdfplumber
# Open the report in a with-block so the PDF file is closed once the table
# has been extracted (it was previously left open).
with pdfplumber.open("C:/Users/这夏/Documents/python/10companies/600887内蒙古伊利实业集团股份有限公司2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[5]  # index 5, i.e. the sixth page
    table = first_page.extract_table()
table
[['主要会计数据', '2020年', '2019年', '本期比上年同期增减(%)', '2018年'], ['营业收入', '96,523,963,249.92', '90,009,132,852.26', '7.24', '78,976,388,687.29'], ['归属于上市公司股东的净利润', '7,078,176,787.81', '6,933,763,430.47', '2.08', '6,439,749,610.82'], ['归属于上市公司股东的扣除非\n经常性损益的净利润', '6,625,112,858.43', '6,268,158,933.70', '5.69', '5,878,050,473.25'], ['经营活动产生的现金流量净额', '9,851,639,164.55', '8,455,480,282.43', '16.51', '8,624,771,799.80'], ['', '2020年末', '2019年末', '本期末比上年同期末增减(%)', '2018年末'], ['归属于上市公司股东的净资产', '30,383,910,257.46', '26,131,025,017.47', '16.28', '27,915,583,839.88'], ['总资产', '71,154,264,385.29', '60,461,267,016.42', '17.69', '47,606,204,460.43']]
■例5:取十家公司画出2019-2020年度公司总资产堆积柱形图
# Stacked bar chart over ten companies: the 2020 values form the lower
# segment, the 2019 values are stacked on top of them.
position = np.arange(1, 11)
data1 = [
    6719137045.72, 13373475395.64, 20309910295, 71154264385.29,
    15066726493.73, 4737859938.11, 3080558015.83, 2418014632.76,
    4034293142.99, 6058748582.89,
]
w = 1
plt.bar(
    x=position,
    height=data1,
    width=w,
    color='orange',
    edgecolor='r',
    hatch='/',
    tick_label=[
        '维维股份', '三元股份', '光明乳业', '伊利实业', '养元饮品',
        '香飘飘', '庄园牧场', 'ST科迪', '贝因美', '皇氏集团',
    ],
    label='2020',
)
data2 = [
    8653112696.05, 13387783922.24, 17637106805, 60461267016.42,
    15110238657.01, 3915618535.94, 2492726974.51, 3424574203.72,
    4375965959.55, 4945626395.91,
]
# bottom=data1 makes every 2019 bar start where the 2020 bar ends.
plt.bar(
    x=position,
    height=data2,
    width=w,
    bottom=data1,
    edgecolor='w',
    hatch='x',
    label='2019',
)
plt.legend()
plt.xticks(fontsize=7)
plt.ylabel('总资产(十亿元)', fontsize=14)
plt.title('2019与2020年度各公司总资产', fontsize=18)
Text(0.5, 1.0, '2019与2020年度各公司总资产')
■例6:取六家公司画出2016-2017年度各公司总资产堆积柱形图
# Stacked bar chart over six companies: the 2016 values form the lower
# segment, the 2017 values are stacked on top of them.
position = np.arange(1, 7)
data1 = [
    4567329385.52, 1741092523.07, 1310386191.42,
    11528077971, 1178543725.30, 1401078354.10,
]
w = 0.8
plt.bar(
    x=position,
    height=data1,
    width=w,
    color='orange',
    edgecolor='r',
    hatch='/',
    tick_label=['古越龙山', '海南椰岛', '莫高股份', '张裕A', '深深宝A', '西藏发展'],
    label='2016',
)
data2 = [
    4643168284.08, 1928741261.16, 1307448185.08,
    12536755208, 1070386220.55, 1469337293.77,
]
# bottom=data1 makes every 2017 bar start where the 2016 bar ends.
plt.bar(
    x=position,
    height=data2,
    width=w,
    bottom=data1,
    edgecolor='w',
    hatch='x',
    label='2017',
)
plt.legend()
plt.xticks(fontsize=12)
plt.ylabel('总资产(十亿元)', fontsize=14)
plt.title('2016与2017年度各公司总资产', fontsize=18)
Text(0.5, 1.0, '2016与2017年度各公司总资产')
import pdfplumber
# Open the report in a with-block so the PDF file is closed once the table
# has been extracted (it was previously left open).
with pdfplumber.open("C:/Users/这夏/Documents/python/10companies/600887内蒙古伊利实业集团股份有限公司2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[5]  # index 5, i.e. the sixth page
    table = first_page.extract_table()
table
[['主要会计数据', '2020年', '2019年', '本期比上年同期增减(%)', '2018年'], ['营业收入', '96,523,963,249.92', '90,009,132,852.26', '7.24', '78,976,388,687.29'], ['归属于上市公司股东的净利润', '7,078,176,787.81', '6,933,763,430.47', '2.08', '6,439,749,610.82'], ['归属于上市公司股东的扣除非\n经常性损益的净利润', '6,625,112,858.43', '6,268,158,933.70', '5.69', '5,878,050,473.25'], ['经营活动产生的现金流量净额', '9,851,639,164.55', '8,455,480,282.43', '16.51', '8,624,771,799.80'], ['', '2020年末', '2019年末', '本期末比上年同期末增减(%)', '2018年末'], ['归属于上市公司股东的净资产', '30,383,910,257.46', '26,131,025,017.47', '16.28', '27,915,583,839.88'], ['总资产', '71,154,264,385.29', '60,461,267,016.42', '17.69', '47,606,204,460.43']]
■以饮料行业中龙头企业为例,将伊利实业集团2020年度主要会计数据做饼图分析
# Pie chart of five accounting figures; every wedge is pulled slightly away
# from the centre by its `explode` offset.
rate = [
    96523963249.92, 71154264385.29, 7078176787.81,
    9851639164.55, 30383910257.46,
]
labels = ['营业收入', '总资产', '净利润', '现金流量净额', '净资产']
plt.figure(figsize=(6, 9))
explode = (0.05, 0.05, 0.1, 0.1, 0.05)
patches, ltext, ptext = plt.pie(
    rate,
    explode=explode,
    labels=labels,
    autopct='%.1f%%',
    shadow=False,
    startangle=90,
)
# Wedge labels get size 20, percentage texts size 24.
for text_group, font_size in ((ltext, 20), (ptext, 24)):
    for x in text_group:
        x.set_size(font_size)
plt.axis('equal')
plt.title('伊利实业集团股份有限公司2020年主要会计数据', fontsize=22)
Text(0.5, 1.0, '伊利实业集团股份有限公司2020年主要会计数据')
■2020年,新冠肺炎疫情突发并肆意蔓延,全球经济环境复杂多变,给企业经营、发展带来诸多困难和挑战。内蒙古伊利实业集团积极防控疫情,开展捐赠和救援行动,充分体现出行业龙头的责任和担当;同时继续以满足消费者需求为目的,坚守“伊利即品质”信条,通过一系列举措进一步提升创新能力和品牌形象,持续培育并拓展新业务。2020年公司新品销售收入占比16.0%,重点产品销售收入比上年同期增长9.6%。液体乳业务零售额比上年同期增长12.7%,其中有机液体乳零售额比上年同期增长35.1%,市场份额达50.6%,在对应的细分市场中位居首位。随着大众健康意识提升、消费升级及消费场景多元化,乳制品受到越来越多的关注,消费者对各类饮料的需求持续稳定增加,中国饮料制造业也在不断发展壮大,产品种类日趋多样化,产量逐年提高,市场整体呈“健康化”发展态势。乳制品及健康饮品属于大众日常消费品,行业周期性特征不明显。延续2021年的热度,食品饮料行业在2022年依然是投资领域的热点。我认为主要原因有如下几个方面:首先,疫情以来,饮品行业新消费场景不断涌现,很多饮品企业契合这些需求,实现了快速成长。其次,近几年来食品饮料板块整体业绩向好,推动估值先行。最后,在整体经济环境偏弱的情况下,食品饮料有较好的抗风险性。公开数据显示,2020年底中国茶饮市场总规模达到4420亿元,咖啡市场总规模为2155亿元。2021年,国内食品饮料领域更是发生多起投融资事件:蜜雪冰城获得了20亿美元融资,奈雪的茶上市,喜茶、沪上阿姨等品牌接连获得融资,高瓴资本、红杉资本、字节跳动、三七互娱等投资巨头和互联网巨头纷纷入局。据不完全统计,2020年至2021年上半年无糖饮料融资事件共36起,无糖饮料行业融资案例增加非常明显,行业热度迅速上升。这得益于减糖、减盐、减脂成为中国食品饮料行业的结构性趋势。随着经济水平的提高,国民对健康愈加重视,突如其来的疫情进一步强化了这一趋势。