In [1]:
import pandas as pd
import numpy as np
import openpyxl
import re
import requests
import time
import os
import fitz
In [2]:
xlsx = '新能源行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
links_1 = links[1:-1]
links_2 = ''.join(links_1)

p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)

df1 = pd.DataFrame({'Link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})
df1.to_excel('新能源行业_final.xlsx')
print(df1.head(10))
                                                Link                 f_name
0  http://news.windin.com/ns/bulletin.php?code=81...  孚能科技:2020年年度报告全文(修订版)
1  http://news.windin.com/ns/bulletin.php?code=4E...     亿华通:2020年年度报告(修订版)
2  http://news.windin.com/ns/bulletin.php?code=5F...    鸿达兴业:2020年年度报告(更新后)
3  http://news.windin.com/ns/bulletin.php?code=60...  鸿达兴业:2020年年度报告摘要(更新后)
4  http://news.windin.com/ns/bulletin.php?code=F1...  孚能科技:2020年年度报告全文(修订版)
5  http://news.windin.com/ns/bulletin.php?code=E7...  孚能科技:2020年年度报告摘要(修订版)
6  http://news.windin.com/ns/bulletin.php?code=49...         方正电机:2020年年度报告
7  http://news.windin.com/ns/bulletin.php?code=3A...       方正电机:2020年年度报告摘要
8  http://news.windin.com/ns/bulletin.php?code=D2...       孚能科技:2020年年度报告摘要
9  http://news.windin.com/ns/bulletin.php?code=CB...       孚能科技:2020年年度报告全文
In [4]:
df2=pd.read_excel(r'C:\Users\pc\Desktop\刘宸虎\新能源行业_final.xlsx')
links=df2['Link'];f_names=df1['f_name']
p=re.compile('(?<=\d{4})(年报)|(年年报)')
f_names=[p.sub('年年度报告',f) for f in df2.f_name]
df2['f_name']=f_names;del p,f_names
print(df2)
     Unnamed: 0                                               Link  \
0             0  http://news.windin.com/ns/bulletin.php?code=81...   
1             1  http://news.windin.com/ns/bulletin.php?code=4E...   
2             2  http://news.windin.com/ns/bulletin.php?code=5F...   
3             3  http://news.windin.com/ns/bulletin.php?code=60...   
4             4  http://news.windin.com/ns/bulletin.php?code=F1...   
..          ...                                                ...   
182         182  http://news.windin.com/ns/bulletin.php?code=9B...   
183         183  http://news.windin.com/ns/bulletin.php?code=95...   
184         184  http://news.windin.com/ns/bulletin.php?code=BB...   
185         185  http://news.windin.com/ns/bulletin.php?code=B1...   
186         186  http://news.windin.com/ns/bulletin.php?code=1C...   

                    f_name  
0    孚能科技:2020年年度报告全文(修订版)  
1       亿华通:2020年年度报告(修订版)  
2      鸿达兴业:2020年年度报告(更新后)  
3    鸿达兴业:2020年年度报告摘要(更新后)  
4    孚能科技:2020年年度报告全文(修订版)  
..                     ...  
182         京城股份:2019年年度报告  
183       京城股份:2019年年度报告摘要  
184          贝斯特:2019年年度报告  
185        贝斯特:2019年年度报告摘要  
186    大洋电机:2018年年度报告(更新后)  

[187 rows x 3 columns]
In [5]:
def filter_links(words,df,include=True):
    ls=[]
    for word in words:
        if include:
            ls.append([word in f for f in df.f_name])
        else:
            ls.append([word not in f for f in df.f_name])
    index=[]
    for r in range(len(df)):
        flag=not include
        for c in range(len(words)):
            if include:
                flag = flag or ls[c][r]
            else:
                flag = flag and ls[c][r]
        index.append(flag)
    df3 = df[index]
    return(df3)
In [6]:
df_all=filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df1,include=False)
df_orig=filter_links(['(','(',],df_all,include=False)
df_updt=filter_links(['(','(',],df_all,include=True)
df_updt=filter_links(['取消'],df_updt,include=False)
print(df_orig,df_updt)
                                                  Link            f_name
6    http://news.windin.com/ns/bulletin.php?code=49...    方正电机:2020年年度报告
9    http://news.windin.com/ns/bulletin.php?code=CB...  孚能科技:2020年年度报告全文
11   http://news.windin.com/ns/bulletin.php?code=6B...    卧龙电驱:2020年年度报告
12   http://news.windin.com/ns/bulletin.php?code=C6...    汉钟精机:2020年年度报告
14   http://news.windin.com/ns/bulletin.php?code=0A...   亿华通:2020年年度报告全文
..                                                 ...               ...
175  http://news.windin.com/ns/bulletin.php?code=97...    安泰科技:2019年年度报告
177  http://news.windin.com/ns/bulletin.php?code=1C...     科达利:2019年年度报告
179  http://news.windin.com/ns/bulletin.php?code=35...     英搏尔:2019年年度报告
182  http://news.windin.com/ns/bulletin.php?code=9B...    京城股份:2019年年度报告
184  http://news.windin.com/ns/bulletin.php?code=BB...     贝斯特:2019年年度报告

[86 rows x 2 columns]                                                   Link                 f_name
0    http://news.windin.com/ns/bulletin.php?code=81...  孚能科技:2020年年度报告全文(修订版)
1    http://news.windin.com/ns/bulletin.php?code=4E...     亿华通:2020年年度报告(修订版)
2    http://news.windin.com/ns/bulletin.php?code=5F...    鸿达兴业:2020年年度报告(更新后)
4    http://news.windin.com/ns/bulletin.php?code=F1...  孚能科技:2020年年度报告全文(修订版)
93   http://news.windin.com/ns/bulletin.php?code=6B...     奥特迅:2019年年度报告(更新后)
94   http://news.windin.com/ns/bulletin.php?code=AF...    鸿达兴业:2019年年度报告(更新后)
95   http://news.windin.com/ns/bulletin.php?code=AF...    鸿达兴业:2019年年度报告(更新后)
98   http://news.windin.com/ns/bulletin.php?code=6A...     特锐德:2019年年度报告(更新后)
100  http://news.windin.com/ns/bulletin.php?code=02...    滨化股份:2019年年度报告(修订版)
101  http://news.windin.com/ns/bulletin.php?code=9A...    金冠股份:2019年年度报告(更新后)
186  http://news.windin.com/ns/bulletin.php?code=1C...    大洋电机:2018年年度报告(更新后)
In [7]:
def sub_with_update(df_updt,df_orig):
    index_orig=[];i=0
    index_updt=[];j=0
    for i,f in enumerate(df_orig):
        for j,fn in enumerate(df_updt):
            if f in fn:
                index_orig.append(i)
                index_updt.append(j)
    for n in range(len(index_orig)):
        i=index_orig[n]
        j=index_updt[n]
        df_orig.iloc[i,-2]=df_updt.iloc[j,-2]
    return(df_orig)

df_newest=sub_with_update(df_updt,df_orig)

df_newest.sort_values(by=['f_name'],inplace=True,ignore_index=True)
print(df_newest)
                                                 Link           f_name
0   http://news.windin.com/ns/bulletin.php?code=47...  *ST京城:2020年年度报告
1   http://news.windin.com/ns/bulletin.php?code=15...  *ST江特:2020年年度报告
2   http://news.windin.com/ns/bulletin.php?code=8F...   ST电能:2020年年度报告
3   http://news.windin.com/ns/bulletin.php?code=FD...   中泰股份:2019年年度报告
4   http://news.windin.com/ns/bulletin.php?code=D3...   中泰股份:2020年年度报告
..                                                ...              ...
81  http://news.windin.com/ns/bulletin.php?code=1D...   雪人股份:2020年年度报告
82  http://news.windin.com/ns/bulletin.php?code=21...   鸿达兴业:2019年年度报告
83  http://news.windin.com/ns/bulletin.php?code=3A...   鸿达兴业:2020年年度报告
84  http://news.windin.com/ns/bulletin.php?code=6D...   鹏辉能源:2019年年度报告
85  http://news.windin.com/ns/bulletin.php?code=E6...   鹏辉能源:2020年年度报告

[86 rows x 2 columns]
D:\anaconda\lib\site-packages\pandas\core\indexing.py:670: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)

D:\anaconda\lib\site-packages\ipykernel_launcher.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':

D:\anaconda\lib\site-packages\ipykernel_launcher.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [8]:
df_newest['公司简称']=[f[:4] for f in df_newest.f_name]
counts=df_newest['公司简称'].value_counts()
ten_company=[]
for cn in counts.index[:10]:
    ten_company.append(filter_links([cn],df_newest))
print(ten_company)
[                                                 Link          f_name  公司简称
20  http://news.windin.com/ns/bulletin.php?code=EE...  国轩高科:2019年年度报告  国轩高科
21  http://news.windin.com/ns/bulletin.php?code=EE...  国轩高科:2019年年度报告  国轩高科
22  http://news.windin.com/ns/bulletin.php?code=6B...  国轩高科:2020年年度报告  国轩高科,                                                  Link         f_name  公司简称
41  http://news.windin.com/ns/bulletin.php?code=BF...  格林美:2019年年度报告  格林美:
42  http://news.windin.com/ns/bulletin.php?code=BF...  格林美:2019年年度报告  格林美:
43  http://news.windin.com/ns/bulletin.php?code=07...  格林美:2020年年度报告  格林美:,                                                  Link          f_name  公司简称
24  http://news.windin.com/ns/bulletin.php?code=25...  大洋电机:2019年年度报告  大洋电机
25  http://news.windin.com/ns/bulletin.php?code=25...  大洋电机:2019年年度报告  大洋电机
26  http://news.windin.com/ns/bulletin.php?code=BD...  大洋电机:2020年年度报告  大洋电机,                                                  Link          f_name  公司简称
12  http://news.windin.com/ns/bulletin.php?code=C4...  冰轮环境:2019年年度报告  冰轮环境
13  http://news.windin.com/ns/bulletin.php?code=12...  冰轮环境:2020年年度报告  冰轮环境,                                                  Link          f_name  公司简称
35  http://news.windin.com/ns/bulletin.php?code=60...  容百科技:2019年年度报告  容百科技
36  http://news.windin.com/ns/bulletin.php?code=EB...  容百科技:2020年年度报告  容百科技,                                                  Link          f_name  公司简称
31  http://news.windin.com/ns/bulletin.php?code=57...  宁德时代:2019年年度报告  宁德时代
32  http://news.windin.com/ns/bulletin.php?code=E0...  宁德时代:2020年年度报告  宁德时代,                                                  Link          f_name  公司简称
33  http://news.windin.com/ns/bulletin.php?code=97...  安泰科技:2019年年度报告  安泰科技
34  http://news.windin.com/ns/bulletin.php?code=65...  安泰科技:2020年年度报告  安泰科技,                                                  Link          f_name  公司简称
59  http://news.windin.com/ns/bulletin.php?code=82...  盛弘股份:2019年年度报告  盛弘股份
60  http://news.windin.com/ns/bulletin.php?code=BB...  盛弘股份:2020年年度报告  盛弘股份,                                                  Link          f_name  公司简称
80  http://news.windin.com/ns/bulletin.php?code=1B...  雪人股份:2019年年度报告  雪人股份
81  http://news.windin.com/ns/bulletin.php?code=1D...  雪人股份:2020年年度报告  雪人股份,                                                  Link          f_name  公司简称
67  http://news.windin.com/ns/bulletin.php?code=31...  蓝海华腾:2019年年度报告  蓝海华腾
68  http://news.windin.com/ns/bulletin.php?code=51...  蓝海华腾:2020年年度报告  蓝海华腾]
D:\anaconda\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

In [9]:
def get_PDF_url(url):
    r=requests.get(url);r.encoding='utf-8';html=r.text
    r.close()
    p=re.compile('<a href=(.*?)\s.*?>(.*?)</a>',re.DOTALL)
    a=p.search(html)
    if a is None:
        Warning('没有找到下载链接,请手动检查链接:%s' % url)
        return()
    else:
        href=a.group(1);fname=a.group(2).strip()
    href=r.url[:26]+href
    return((href,fname))
In [10]:
for n in range(len(ten_company)):
    df_One_Co = pd.DataFrame({'Link':[h for h in ten_company[n]['Link']],
                    'f_name':[f for f in ten_company[n]['f_name']]})
    for i in range(len(df_One_Co)):
        href,fname=get_PDF_url(df_One_Co.Link[i])
        r=requests.get(href,allow_redirects=True)
        open('%s'%fname,'wb').write(r.content)
        time.sleep(10)
    r.close()
In [11]:
filenames = os.listdir()
print(filenames)
['.ipynb_checkpoints', '688005容百科技2019年年度报告.pdf', '688005容百科技2020年年度报告.pdf', 'project.ipynb', '冰轮环境:2019年年度报告.pdf', '冰轮环境:2020年年度报告.pdf', '国轩高科:2019年年度报告.pdf', '国轩高科:2020年年度报告.pdf', '大洋电机:2019年年度报告.pdf', '大洋电机:2020年年度报告.pdf', '宁德时代:2019年年度报告.pdf', '宁德时代:2020年年度报告.pdf', '安泰科技:2019年年度报告.pdf', '安泰科技:2020年年度报告.pdf', '新能源行业.xlsx', '新能源行业_final.xlsx', '格林美:2019年年度报告.pdf', '格林美:2020年年度报告.pdf', '盛弘股份:2019年年度报告.pdf', '盛弘股份:2020年年度报告.pdf', '蓝海华腾:2019年年度报告.pdf', '蓝海华腾:2020年年度报告.pdf', '雪人股份:2019年年度报告.pdf', '雪人股份:2020年年度报告.pdf']
In [12]:
for i in range(len(ten_company)):
    prefix= ten_company[i].iloc[1,-1]
    print(prefix)
    pdf = [f for f in filenames if prefix[:3] in f and f.endswith('.pdf')]
    print(pdf)
    year = [p[-13:-9] for p in pdf]
    print(year)
    
国轩高科
['国轩高科:2019年年度报告.pdf', '国轩高科:2020年年度报告.pdf']
['2019', '2020']
格林美:
['格林美:2019年年度报告.pdf', '格林美:2020年年度报告.pdf']
['2019', '2020']
大洋电机
['大洋电机:2019年年度报告.pdf', '大洋电机:2020年年度报告.pdf']
['2019', '2020']
冰轮环境
['冰轮环境:2019年年度报告.pdf', '冰轮环境:2020年年度报告.pdf']
['2019', '2020']
容百科技
['688005容百科技2019年年度报告.pdf', '688005容百科技2020年年度报告.pdf']
['2019', '2020']
宁德时代
['宁德时代:2019年年度报告.pdf', '宁德时代:2020年年度报告.pdf']
['2019', '2020']
安泰科技
['安泰科技:2019年年度报告.pdf', '安泰科技:2020年年度报告.pdf']
['2019', '2020']
盛弘股份
['盛弘股份:2019年年度报告.pdf', '盛弘股份:2020年年度报告.pdf']
['2019', '2020']
雪人股份
['雪人股份:2019年年度报告.pdf', '雪人股份:2020年年度报告.pdf']
['2019', '2020']
蓝海华腾
['蓝海华腾:2019年年度报告.pdf', '蓝海华腾:2020年年度报告.pdf']
['2019', '2020']
In [13]:
def getText(pdf):
    text = ''
    doc = fitz.open(pdf)
    for page in doc:
        text += page.getText()
    doc.close()
    return(text)
    
def parse_data_line(pdf):    
    text = getText(pdf)
    p1 = re.compile('\w{1,2}、主要会计数据和财务指标(.*?)(?=\w{1,2}、)',re.DOTALL)
    subtext = p1.search(text)
    if subtext is None:
        p1 = re.compile('(\w{1,2})\s*主要会计数据(.*?)(?=(\w{1,2})\s*主要财务指标)',re.DOTALL)
        subtext = p1.search(text).group(0)
    else:
        subtext = p1.search(text).group(0)
    subp='([0-9,.%\- ]*?)\n' and '([0-9,.%\- ]*?)\s'
    psub='%s%s%s%s'%(subp,subp,subp,subp)
    p=re.compile('(\D+\n)+%s'%psub)
    lines=p.findall(subtext)    
    return(lines)
In [14]:
filenames = os.listdir()
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif']=['SimHei']  #确保显示中文
plt.rcParams['axes.unicode_minus'] = False  #确保显示负数的参数设置
In [15]:
for i in range(5):
    prefix= ten_company[-i].iloc[1,-1]
    print(prefix)
    pdf = [f for f in filenames if prefix[:3] in f and f.endswith('.pdf')]
    year = [p[-13:-9] for p in pdf]
    df_data=pd.DataFrame({'年份':year,
                      '年营业收入':''})
    for y in range(len(pdf)):
        lines=parse_data_line(pdf[y])
        df_fnc=pd.DataFrame([l for l in lines],columns=['',year[y]+'年',str(eval(year[y])-1)+'年','本年比上年增减',str(eval(year[y])-2)+'年']) 
        #df_fnc.to_excel('%s%s财务数据.xls'%(prefix,year[y]))
        s=df_fnc.iloc[0,1]
        s=s.replace(',','')
        df_data['年营业收入'][[y]]=eval(s)
    print(df_data)
    plt.figure()
    plt.plot(df_data['年份'],df_data['年营业收入'],label=u'年营业收入',color='#FF8247')
    for x,y in zip(df_data['年份'],df_data['年营业收入']):#显示bar数值
        plt.text(x,y,'%.3e'%y,ha='center',va='bottom')
    plt.xlabel(u'(年)',fontsize=13)
    plt.ylabel(u'年营业收入(元)',fontsize=13,rotation=90)
    plt.legend(loc='best')
    plt.title(u'%s%s-%s年营业收入的可视化'%(prefix,str(year[0]),str(year[-1])),fontsize=13)
    plt.yticks(range(0,10**10,10**9))
    plt.show()
国轩高科
     年份        年营业收入
0  2019   4.9589e+09
1  2020  6.72423e+09
D:\anaconda\lib\site-packages\matplotlib\cbook\__init__.py:2064: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x[:, None]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:248: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x = x[:, np.newaxis]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:250: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  y = y[:, np.newaxis]

蓝海华腾
     年份        年营业收入
0  2019  3.20088e+08
1  2020  4.00701e+08
D:\anaconda\lib\site-packages\matplotlib\cbook\__init__.py:2064: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x[:, None]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:248: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x = x[:, np.newaxis]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:250: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  y = y[:, np.newaxis]

雪人股份
     年份        年营业收入
0  2019  1.51357e+09
1  2020  1.45838e+09
D:\anaconda\lib\site-packages\matplotlib\cbook\__init__.py:2064: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x[:, None]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:248: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x = x[:, np.newaxis]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:250: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  y = y[:, np.newaxis]

盛弘股份
     年份        年营业收入
0  2019  6.35845e+08
1  2020  7.71355e+08
D:\anaconda\lib\site-packages\matplotlib\cbook\__init__.py:2064: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x[:, None]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:248: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x = x[:, np.newaxis]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:250: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  y = y[:, np.newaxis]

安泰科技
     年份        年营业收入
0  2019  4.78022e+09
1  2020  4.97915e+09
D:\anaconda\lib\site-packages\matplotlib\cbook\__init__.py:2064: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x[:, None]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:248: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  x = x[:, np.newaxis]

D:\anaconda\lib\site-packages\matplotlib\axes\_base.py:250: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  y = y[:, np.newaxis]

In [ ]: