python抓取某浪申万Ⅰ级分类数据并保存到文本文件和PostgreSQL

本文给出抓取申万Ⅰ级分类板块数据并保存到文本文件和PostgreSQL数据库的应用示例，并附有完整代码。有些板块下的个股数量很多，需要分页抓取。抓取的内容包括申万Ⅰ级分类代码、申万Ⅰ级分类名称、带市场前缀的个股代码、个股代码、个股名称、涨跌幅、总市值、流通市值和换手率，数据格式如下：

shw1_code,category_name,category_mktcode,stock_code,stock_name,stock_changepercent,stock_mktcap,stock_nmc,stock_hsl
sw1_770000,美容护理,bj832982,832982,锦波生物,1.836,1393652.33,486312.51,0.77862
sw1_770000,美容护理,sh600223,600223,福瑞达,-0.625,808172.18,808172.18,1.84122
sw1_770000,美容护理,sh600249,600249,两面针,6.076,230450.0,230450.0,3.76191
sw1_770000,美容护理,sh600315,600315,上海家化,0.533,1274681.98,1268971.41,1.83879
......

了解抓取申万Ⅲ级分类数据并保存到文本文件和PostgreSQL的方法，点击这里

代码如下：

import requests
from bs4 import BeautifulSoup
import re
from operator import itemgetter
import time
import random
import pandas as pd

def remove_col(arr, ith):
    """Return a copy of the 2-D list *arr* with column *ith* removed.

    The column count is taken from the first row, so every row must have
    at least as many items as the first one.

    Args:
        arr: Sequence of rows (each row an indexable sequence).
        ith: Zero-based index of the column to drop. An index that matches
            no column (out of range or negative) leaves the rows unchanged.

    Returns:
        A new list of lists; an empty list when *arr* is empty.
    """
    if not arr:
        return []
    # Build the kept indices explicitly instead of using itemgetter:
    # itemgetter returns a bare item (not a tuple) when only one index is
    # left, which broke two-column inputs.
    keep = [k for k in range(len(arr[0])) if k != ith]
    return [[row[k] for k in keep] for row in arr]

# Endpoint that returns the whole market-node tree as text.
url = 'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodes'

# Browser-like User-Agent sent with every request.
heads = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}

# The headers above were previously defined but never sent; pass them, and
# bound the wait so a stalled connection cannot hang the script forever.
resText = requests.get(url, headers=heads, timeout=30)

soup = BeautifulSoup(resText.content, features='lxml')

s = soup.text

print('\n申万一级分类:')
# Keep only the text between the 'swhy' and 'sw1_hy' markers, then the part
# from the first '[[' up to (not including) the first ']]'.
shw1 = s[s.find('swhy'):s.find('sw1_hy')]
shw1_cut = shw1[shw1.find('[['):shw1.find(']]')]
# Strip brackets and quotes so each entry becomes "field,field,...]".
shw1_cut = re.sub(r'\[','',shw1_cut)
shw1_cut = re.sub(r'"','',shw1_cut)
shw1_list = shw1_cut.split(']')

shw1_list_split = []
for i, raw_item in enumerate(shw1_list):
    item_split = raw_item.split(',')
    if i == 0:
        # First entry: the name is field 0; turn \uXXXX escapes into text.
        item_split[0] = item_split[0].encode('utf-8').decode('unicode_escape')
    else:
        # Later entries carry one extra leading field (presumably the empty
        # string left by the ',' that separated entries), so the name is
        # field 1 and the three useful fields are 1..3.
        item_split[1] = item_split[1].encode('utf-8').decode('unicode_escape')
        item_split = item_split[1:4]
    shw1_list_split.append(item_split)

# Drop the middle column, leaving the first and last fields of each entry
# (used below as category name and node code).
result_shw1 = remove_col(shw1_list_split, 1)
print()
print('申万一级分类总数:',len(result_shw1))
print(result_shw1)
print()

## Shenwan level-1 categories and the stocks listed under each of them.
print('申万一级及其所属股票')
shw1_category_and_stocks = []  # one [[name, code], [[symbol, code, name], ...]] entry per category
shw1_categorystock = []        # one flat row per (category, stock) pair
for i, category in enumerate(result_shw1):
    category_name = category[0]
    category_code = category[1]
    node_id = category_code[0:11]

    # Download page after page until a page comes back (almost) empty.
    pages = []
    page_i = 1
    while True:
        # Example: https://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page=1&num=500&sort=symbol&asc=1&node=sw1_270000&symbol=&_s_r_a=init
        url1 = 'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page='+str(page_i)+'&num=200&sort=symbol&asc=1&node=' + node_id + '&symbol=&_s_r_a=init'
        print(url1,i,category_name,node_id)

        # Send the browser-like headers defined earlier in the script and
        # bound the wait so a stalled connection cannot hang the loop.
        resText2 = requests.get(url1, headers=heads, timeout=30)
        soup2 = BeautifulSoup(resText2.content, features='lxml')
        if len(soup2.text) > 10:
            pages.append(soup2.text)
            page_i = page_i + 1
        else:
            # A body of 10 characters or fewer is treated as "no more data".
            break
    s2 = ''.join(pages)

    print('------------------------------------------------------')

    # Remove '[', ']' and '{' so the records are separated only by '}'.
    resStr2 = s2.translate(str.maketrans('', '', '[]{'))

    resStr2_list = resStr2.split('}')
    resStr2_list.pop()  # drop the empty trailing element produced by split

    shw_one_stocks = []
    for record in resStr2_list:
        singlestock_info = record.split(',')
        rst = [ss.split(':') for ss in singlestock_info]
        # A record with exactly 20 fields starts at field 0. Any other
        # record is expected to carry one extra leading field (the ','
        # that separated it from the previous record), so it starts at 1.
        base = 0 if len(singlestock_info) == 20 else 1
        symbol = rst[base][1]
        code = rst[base + 1][1]
        name = rst[base + 2][1]

        # Per-category list keeps the values as received (still quoted).
        shw_one_stocks.append([symbol, code, name.encode('utf-8').decode('unicode_escape')])
        # Flat row: [1:-1] strips the surrounding quotes. The trailing
        # fields are addressed from the end so they work for either base.
        shw1_categorystock.append([category_name,
                                   category_code,
                                   symbol[1:-1], code[1:-1],
                                   name[1:-1].encode('utf-8').decode('unicode_escape'),
                                   rst[-15][1],                 # changepercent (raw text)
                                   round(float(rst[-3][1]),2),  # mktcap, total market cap
                                   round(float(rst[-2][1]),2),  # nmc, circulating market cap
                                   rst[-1][1]                   # turnoverratio (raw text)
                                   ])

    category_header = [category_name, category_code]
    shw1_category_and_stocks.append([category_header,shw_one_stocks])
    time.sleep(random.randint(1,6))  # random pause so the site does not block us for rapid requests

# Echo what was collected: first per category, then one line per stock row.
for category_entry in shw1_category_and_stocks:
    print(category_entry[0])
    print(category_entry[1])
    print()

print()
for stock_row in shw1_categorystock:
    print(stock_row)

print()
print('申万一级分类总数：',len(result_shw1))
print('申万一级分类总数(包括各分类的股票)：',len(shw1_categorystock))


# Category table: one line per level-1 category (code, name).
shw1_category = [entry[0][0] for entry in shw1_category_and_stocks]
shw1_code = [entry[0][1] for entry in shw1_category_and_stocks]
dict1 = {'shw1_code': shw1_code, 'shw1_category': shw1_category}
df1 = pd.DataFrame(dict1)
df1.to_csv('shenwan1_category.csv', index=False)

# Stock table: split the flat rows into one list per column. The names are
# listed in row order (position 0 is the category name, 1 its code, ...).
(
    shw1_category_name,
    shw1_category_code,
    shw1_category_mktcode,
    shw1_stock_code,
    shw1_stock_name,
    shw1_stock_changepercent,
    stock_mktcap,
    stock_nmc,
    stock_hsl,
) = ([row[col] for row in shw1_categorystock] for col in range(9))

dict2 = {
    'shw1_code': shw1_category_code,
    'category_name': shw1_category_name,
    'category_mktcode': shw1_category_mktcode,
    'stock_code': shw1_stock_code,
    'stock_name': shw1_stock_name,
    'stock_changepercent': shw1_stock_changepercent,
    'stock_mktcap': stock_mktcap,
    'stock_nmc': stock_nmc,
    'stock_hsl': stock_hsl,
}
df2 = pd.DataFrame(dict2)
df2.to_csv('shenwan1_category_stocks.csv', index=False)

# Set to False to skip the PostgreSQL step and only keep the CSV files.
save_to_db = True

if save_to_db:
    # Imported here because the driver is only needed when saving to the
    # database (the script never imported it at the top).
    import psycopg2

    # One tuple per stock, in the column order of the INSERT below.
    # NOTE(review): this script does not collect a price, so stock_price is
    # inserted as NULL -- confirm the column allows NULL or add the "trade"
    # field to the scraped rows.
    stocks_hsl = [
        (cat_code, cat_name, mkt_code, stock_code, stock_name, mktcap, nmc, None, hsl)
        for cat_code, cat_name, mkt_code, stock_code, stock_name, mktcap, nmc, hsl in zip(
            shw1_category_code, shw1_category_name, shw1_category_mktcode,
            shw1_stock_code, shw1_stock_name, stock_mktcap, stock_nmc, stock_hsl)
    ]

    # Rows that hit a uniqueness conflict are skipped instead of raising.
    sql = ("INSERT INTO stocks_hsl "
           "(shw1_code,category_name,category_mktcode,stock_code,stock_name,stock_mktcap,stock_nmc,stock_price,turnoverratio) "
           "VALUES (%s, %s,%s, %s,%s, %s,%s, %s,%s) ON CONFLICT DO NOTHING")

    # Pre-bind both handles so the cleanup below is safe even when
    # connect() itself fails.
    conn = None
    cur = None
    try:
        conn = psycopg2.connect(
            host="localhost",
            database="stockdb",
            user="postgres",
            password="******"
        )
        cur = conn.cursor()
        # Batch insert of all rows, then commit the transaction.
        cur.executemany(sql, stocks_hsl)
        conn.commit()
    except psycopg2.Error as e:
        print(" 连接 PostgreSQL 时报错！", e)
    finally:
        # Close the cursor and the connection if they were opened.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
else:
    print('本次不保存个股换手率数据到数据库')

# 数据格式示例：
#  ['"sz000070"', '"000070"', '"特发信息"', '"5.860"', '0.04', '0.687', '"5.850"', '"5.860"', '"5.820"', 
# '"5.810"', '"5.860"', '"5.660"', '8478528', '48999967', '2022-06-14', '-7.808', '2.588', '494924.355832', '487960.883258', '1.0182']

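## Field reference (one record, in the order the fields appear)
# "symbol":"sz002281",                        stock code with market prefix
# "code":"002281",                            stock code
# "name":"\u5149\u8fc5\u79d1\u6280",          stock name (unicode-escaped)
# "trade":"22.740",                           latest price
# "pricechange":-0.29,                        price change
# "changepercent":-1.259,                     percent change
# "buy":"22.740",
# "sell":"22.750",
# "settlement":"23.030",                      previous close
# "open":"23.050",
# "high":"23.220",
# "low":"22.670",
# "volume":6874488,                           trading volume
# "amount":157353968,                         turnover amount
# "ticktime":"15:00:03",
# "per":31.151,
# "pb":2.905,                                 price-to-book ratio
# "mktcap":1590455.879532,                    total market capitalisation
# "nmc":1507701.365214,                       circulating market capitalisation
# "turnoverratio":1.03685,                    turnover rate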

python抓取某浪申万Ⅰ级分类数据并保存到文本文件和PostgreSQL

相关推荐

发表回复