从财经门户网抓取A股实时行情含python完整代码

程序化交易股票最大的难点是获取交易时段的实时行情,即使是券商为机构提供的交易服务,也不允许向外部发送实时行情数据。从一些门户网站的财经股票频道抓取实时数据,是一种不得已但可行的办法:只要抓取不频繁,并且在几个网站之间轮换,例如每3到5分钟抓取一次、每次换一个网站,这类网站一般能够容忍,不会封禁数据端口。如果交易频率较低,以15分钟以上的K线数据作为信号数据,那么一般情况下没有问题,这样的抓取频率门户网站是可以容许的。

国内网站一般有冻财、旧浪、荷讯,国外网站可以从谷哥、雅胡的财经板块抓取。不同网站的行情刷新时间不一样,相比专门的证券软件也有延迟:国内网站大概2秒刷新一次,延迟大约也是2、3秒;但谷哥、雅胡的延迟比较明显,大概要1、2分钟。这类大门户网站对抓取数据的封禁条件和严格程度也有差别。

从财经门户网抓取A股实时行情含python完整代码

以下提供的python示例代码是针对冻财和旧浪的,都是抓取全部A股的实时行情。这样做也容易被网站封禁,可以根据自己的需要只指定较少的股票来抓取,这样也更容易被门户网站容忍。

import time
import requests
from bs4 import BeautifulSoup
import json
import schedule
import re
import random

class EastMoneyScraping:
    """Scrape the Eastmoney per-stock capital-flow ranking, one page at a time.

    ``start()`` walks ``PAGE_COUNT`` pages and returns every row it collected;
    ``request(page)`` downloads and parses a single page.  Each row is the
    comma-separated value list of one stock, with the ``"fNN":`` keys and
    ``"-"`` placeholders stripped out.
    """

    PAGE_COUNT = 10       # number of pages fetched by start()
    PAGE_SIZE = 500       # rows per page; must agree with pz= in _URL_TEMPLATE
    REQUEST_TIMEOUT = 10  # seconds before a single HTTP request is abandoned

    _HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}

    # Real-time per-stock capital flow, ranked by net-inflow share (fid=f184).
    # pn is the page number, pz the page size.
    _URL_TEMPLATE = 'http://push2.eastmoney.com/api/qt/clist/get?cb=jQuery112306936708934527107_1643082735230&fid=f184&po=1&pz=500&pn={0}&np=1&fltt=2&invt=2&ut=b2884a393a59ad64002292a3e90d46a5&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A13%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A7%2Bf%3A!2%2Cm%3A1%2Bt%3A3%2Bf%3A!2&fields=f12%2Cf14%2Cf2%2Cf3%2Cf62%2Cf184%2Cf66%2Cf69%2Cf72%2Cf75%2Cf78%2Cf81%2Cf84%2Cf87%2Cf204%2Cf205%2Cf124%2Cf1%2Cf13'

    # Alternative endpoints kept from the original source for reference:
    # Stock quotes, sorted by turnover rate
    # http://24.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124008724829083132013_1627545626768&pn=1&pz=200&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f8&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1627545626850
    # url = "http://fund.eastmoney.com/Data/Fund_JJJZ_Data.aspx?t=1&lx=1&letter=&gsid=&text=&sort=zdf,desc&page={0},200&dt={1}&atfc=&onlySale=0".format(page,dt)
    # Real-time per-stock capital flow
    # url = "http://push2.eastmoney.com/api/qt/clist/get?cb=jQuery112309937869357366578_1617693739231&fid=f62&po=1&pz=100&pn={0}&np=1&fltt=2&invt=2&ut=b2884a393a59ad64002292a3e90d46a5&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A13%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A7%2Bf%3A!2%2Cm%3A1%2Bt%3A3%2Bf%3A!2&fields=f12%2Cf14%2Cf2%2Cf3%2Cf62%2Cf184%2Cf66%2Cf69%2Cf72%2Cf75%2Cf78%2Cf81%2Cf84%2Cf87%2Cf204%2Cf205%2Cf124".format(page)
    # Real-time per-stock capital flow, ranked by total net inflow
    # url = "https://push2.eastmoney.com/api/qt/clist/get?cb=jQuery112306936708934527107_1643082735230&fid=f62&po=1&pz=500&pn={0}&np=1&fltt=2&invt=2&ut=b2884a393a59ad64002292a3e90d46a5&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A13%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A7%2Bf%3A!2%2Cm%3A1%2Bt%3A3%2Bf%3A!2&fields=f12%2Cf14%2Cf2%2Cf3%2Cf62%2Cf184%2Cf66%2Cf69%2Cf72%2Cf75%2Cf78%2Cf81%2Cf84%2Cf87%2Cf204%2Cf205%2Cf124%2Cf1%2Cf13".format(page)
    # http://dcfm.eastmoney.com/EM_MutiSvcExpandInterface/api/js/get?type=HSGT20_GGTJ_SUM&token=894050c76af8597a853f5b408b759f5d&st=ShareSZ_Chg_One&sr=-1&p=3&ps=50&js=var%20yfuBiVhO={pages:(tp),data:(x)}&filter=(DateType=%271%27%20and%20HdDate=%272021-03-31%27)&rt=53922635
    # Number of shareholders
    # NOTE(review): the `"eColumns` fragment below looks like a mangled
    # `&quoteColumns` parameter -- verify before reusing this URL.
    # url= " http://datacenter-web.eastmoney.com/api/data/v1/get?callback=jQuery1123009916465672302577_1629616481931&sortColumns=HOLD_NOTICE_DATE%2CSECURITY_CODE&sortTypes=-1%2C-1&pageSize=50&pageNumber=1&reportName=RPT_HOLDERNUMLATEST&columns=SECURITY_CODE%2CSECURITY_NAME_ABBR%2CEND_DATE%2CINTERVAL_CHRATE%2CAVG_MARKET_CAP%2CAVG_HOLD_NUM%2CTOTAL_MARKET_CAP%2CTOTAL_A_SHARES%2CHOLD_NOTICE_DATE%2CHOLDER_NUM%2CPRE_HOLDER_NUM%2CHOLDER_NUM_CHANGE%2CHOLDER_NUM_RATIO%2CEND_DATE%2CPRE_END_DATE"eColumns=f2%2Cf3&source=WEB&client=WEB"

    # Patterns used to flatten the JSONP payload; compiled once for all pages.
    _PAYLOAD_RE = re.compile(r"\[{(.*)}}\);")
    _TAIL_RE = re.compile(r"}}}\);")
    _KEY_RE = re.compile(r"\"f\d+\":")
    _DASH_RE = re.compile(r"\,\"-\"")

    def __init__(self):
        # Maximum number of download attempts for one page; once they are all
        # used up the page is reported as failed.
        self.tryMaxTime = 5

    def start(self):
        """Fetch every page and return the combined list of row strings.

        Prints progress, an overall success/failure line and the elapsed
        time.  Rows from pages that succeeded are returned even when another
        page failed.
        """
        dateStr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        t1 = time.perf_counter()
        scraping_result = []
        flag = True  # stays True only if every page was scraped successfully
        for index in range(1, self.PAGE_COUNT + 1):
            print(" 抓取第 {0} 到 {1} 只股票,时间 {2} ...".format(
                (index - 1) * self.PAGE_SIZE + 1, index * self.PAGE_SIZE, dateStr))
            page_ok, page_rows = self.request(index)
            # Accumulate instead of overwrite, so one failed page is not
            # hidden by a later successful one.
            flag = flag and page_ok
            scraping_result.extend(page_rows)
            if index < self.PAGE_COUNT:
                # Random pause between pages to keep the request rate low.
                time.sleep(random.randint(1, 6))
        if flag:
            print("时间 {0} 爬取成功!\n".format(dateStr))
        else:
            print("时间 {0} 爬取失败!\n".format(dateStr))

        t2 = time.perf_counter()
        print(f'本次抓取完成,耗时:{round(t2-t1,2)} 秒')

        return scraping_result

    def request(self, page):
        """Download and parse one result page.

        Args:
            page: 1-based page number substituted into the ``pn`` parameter.

        Returns:
            A ``(success, rows)`` tuple.  ``rows`` is a list of strings, one
            per stock; it is empty when ``success`` is False.  The tuple shape
            is the same on every path so callers can always unpack it.
        """
        url = self._URL_TEMPLATE.format(page)
        # Bounded loop instead of recursion: the attempt budget applies to
        # this page only and self.tryMaxTime is left untouched for later pages.
        for _ in range(self.tryMaxTime):
            try:
                response = requests.get(url, headers=self._HEADERS,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.exceptions.RequestException:
                continue
            soup = BeautifulSoup(response.content, features='lxml')
            rows = self._parse_payload(soup.text)
            if rows:
                return True, rows
            # An unparseable body counts as a failed attempt and is retried.
        print("连接超时")
        return False, []

    @classmethod
    def _parse_payload(cls, text):
        """Flatten the JSONP response text into one value string per stock.

        Returns an empty list when the text does not contain the expected
        ``[{ ... }});`` payload.
        """
        match = cls._PAYLOAD_RE.search(text)
        if match is None:
            return []
        body = match.group()
        # Drop the JSON structure characters, the closing "}}});" tail, the
        # "fNN": keys and the "-" placeholders, leaving bare values.
        body = body.replace('[', '').replace(']', '').replace('{', '')
        body = cls._TAIL_RE.sub('', body)
        body = cls._KEY_RE.sub('', body)
        body = cls._DASH_RE.sub('', body)
        return body.split('},')

if __name__ == "__main__":
    # Run one full scrape and dump the rows to a file for manual inspection.
    stockmd_Scraping = EastMoneyScraping()
    scraped_result = stockmd_Scraping.start()
    print(len(scraped_result))
    if scraped_result:
        # Show one sample row; skipped when nothing was scraped so an empty
        # result does not raise IndexError.
        print(scraped_result[0])

    # Explicit UTF-8: the rows contain Chinese stock names, and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open('eastmoney_md.csv','w',encoding='utf-8') as f:  # only for inspecting the data while testing
        f.writelines(str(line) + '\n' for line in scraped_result)

    # Periodic scraping with the `schedule` package (disabled):
    # def job():
    #     stockmd_Scraping.start()
    # schedule.every(300).seconds.do(job)  # run job every 5 minutes
    # schedule.run_pending()
    # while True:
    #     schedule.run_pending()
    #     time.sleep(1)

以上代码的url网页地址字符串中,参数pn是页码,参数pz是每页返回的记录条数。

抓取结果中每条数据的格式如下:

60.68,44.0,"603102",1,"N百合",36409638.0,34722734.0,91.52,1686904.0,4.45,-24068600.0,-63.44,-12341038.0,-32.53,1643087379,95.97

说明:

  • 60.68, 最新价
  • 44.0, 今日涨跌幅
  • "603102", 股票代码
  • 1, (原文未说明,推测为市场代码字段 f13,待确认)
  • "N百合", 股票名称
  • 36409638.0, 今日主力净流入金额
  • 34722734.0, 今日超大单净流入金额
  • 91.52, 今日超大单净流入占比
  • 1686904.0, 今日大单净额
  • 4.45, 今日大单净额占比
  • -24068600.0, 今日中单净流入额
  • -63.44, 今日中单净流入占比
  • -12341038.0, 今日小单净流入额
  • -32.53, 今日小单净流入占比
  • 1643087379, (原文未说明,形似以秒为单位的Unix时间戳,推测为行情更新时间,待确认)
  • 95.97, 今日主力净流入金额占比

抓取旧浪网站的股票实时行情数据的完整python代码:

## 获取新浪多只股票行情的网页地址示例
# url = "http://hq.sinajs.cn/rn=3d0pa&list=sh688670,sh688511,sh688663,sz301025,sh600276,sz301012,sh688718,sh603115,sz300693,sz301037,sh605056,sh688509,sh605020,sh688609,sz300925,sz300995,sh605066"

import time
import datetime
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import random

import csv

page_size = 500  # number of stock codes put into one request URL

# Read the stock codes (first column) from stocks_list.csv.
# The original note says the file holds 4461 codes; that count is not checked here.
# newline='' is what the csv module documents for files handed to csv.reader.
with open('stocks_list.csv','r',encoding='utf-8',newline='') as f:
    # csv.reader yields an empty list for a blank line; skip those rows so
    # indexing the first column cannot raise IndexError.
    stocks_lst = [row[0] for row in csv.reader(f) if row]

# Split the codes into consecutive pages of at most page_size entries.
slicing_stockslst = [stocks_lst[i:i + page_size] for i in range(0, len(stocks_lst), page_size)]

#列表的最大长度为892,即一次最多可抓取892只股票的行情,这是新浪个股行情浏览网页的长度限制

##url_start = "http://hq.sinajs.cn/rn=3d0pa&list="

# Build one request URL per page of stock codes.
# The codes are joined with commas; no trailing comma or stray quote character
# is appended, so the list parameter contains only real stock codes.
url_lst = [
    "http://hq.sinajs.cn/rn=3d0pa&list=" + ",".join(sub_lst)
    for sub_lst in slicing_stockslst
]

# Browser-like request headers sent with every page request.
# NOTE(review): hq.sinajs.cn is reported to reject requests that lack a
# Referer header -- confirm, and add one here if responses come back as errors.
heads = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36   (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}

# Timestamp taken once, before the first request; every progress line shows it.
date = time.localtime(int(time.time()))
dateStr = time.strftime("%Y-%m-%d %H:%M:%S",date)


# Accumulates one parsed row per stock across all pages.
thispage_stkinfo = []
for page_no, page_url in enumerate(url_lst):
    # Send the headers defined above and bound the wait so that one stalled
    # connection cannot hang the whole run.
    r = requests.get(page_url, headers=heads, timeout=10)
    time.sleep(random.randint(3,12))  # random pause between pages to avoid being blocked
    soup = BeautifulSoup(r.content.decode('gbk'),features='lxml')
    resStrList = soup.text.split(';')
    print(" 抓取第 {0} 到 {1} 只股票,时间 {2} ...".format((page_no)*page_size+1,(page_no+1)*page_size,dateStr))
    # The piece after the final ';' is not a quote line, so it is skipped.
    # A separate loop variable keeps the page index intact.
    for raw_entry in resStrList[:-1]:
        # Cut the first 11 characters (the leading part of the variable
        # name), then turn '=' into a field separator and drop quotes and
        # any leftover underscore.
        entry = raw_entry[11:].replace('=', ',').replace('"', '').replace('_', '')
        currline_stockinfo = entry.split(',')
        # Keep the first 11 fields plus fields 31-32.
        result_stockinfo = currline_stockinfo[:11] + currline_stockinfo[31:33]
        thispage_stkinfo.append(result_stockinfo)
    

# Rows with 10 or fewer fields are leftovers of the split-based parsing, not
# real quotes; only the remaining rows are reported and written.
valid_stkinfo = [onestock for onestock in thispage_stkinfo if len(onestock) > 10]

print('抓取的股票行情数据示例:')
if valid_stkinfo:
    # Guarded so that an empty scrape does not raise IndexError.
    print(valid_stkinfo[0])

# Report the number of rows actually written to the file.
print('本次总共抓取 ' + str(len(valid_stkinfo))+ ' 只股票的信息')

# Write the scraped quotes to a csv file.
with open('sina_md.csv','w',newline ='',encoding='utf-8') as csv_file:
    writer= csv.writer(csv_file)
    writer.writerows(valid_stkinfo)

以上代码中的stocks_list.csv文件内容,一共4985只股票(代码注释中写的是4461只,两处数字不一致,请以实际文件为准),如下图所示:

从财经门户网抓取A股实时行情含python完整代码

另外,其中的网页地址字符串 url = "http://hq.sinajs.cn/rn=3d0pa&list=sh688670,sh688511,sh688663" 中包含的股票数量最多为892,即单次抓取的个股数量不能超过892个。这是笔者测试得出的结果,原因不得而知。建议每次抓取的数量不要太多,否则很容易被网站认定为恶意访问。

发布者:股市刺客,转载请注明出处:https://www.95sca.cn/archives/76220
站内所有文章皆来自网络转载或读者投稿,请勿用于商业用途。如有侵权、不妥之处,请联系站长并出示版权证明以便删除。敬请谅解!

(0)
股市刺客的头像股市刺客
上一篇 2024 年 7 月 11 日
下一篇 2024 年 7 月 11 日

相关推荐

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注