Crawling data for the S&P 500 companies from yahoo finance_ (request+XPath) VS just using Request

本文介绍了一种从维基百科抓取S&P500公司列表的方法,并通过Yahoo财经API获取这些公司的详细股票信息,包括价格、成交量、市值等关键指标。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

Using the split function to extract the content that we need

#get every thing before '0001555280'
#CIK is unique so we can use last one of them to split a table
wikiFirstParse = wikiResponse.text.split('0001555280')[0]

#get everything after the third '500 component stocks' (split index 3)
wikiTableParse = wikiFirstParse.split('500 component stocks')[3]

#finding a rule

#the symbol 'MMM' is on the 5th hyper link

But some symbols (such as 'AEE') are not on the next fourth hyperlink (after Bristol), so we have to figure out another method

I found that a hyperlink whose URL contains "nyse" or "nasdaq" will have the ticker symbol as the text of its HTML 'a' tag

 

import requests

url = "https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch"

# Collect ticker symbols from the Wikipedia S&P 500 constituents page.
wikiURL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
companySymbolList = {"Company": []}

wikiResponse = requests.get(wikiURL)
response = requests.get(url)

# Extraction: the CIK '0001555280' is unique and belongs to the table's
# last row, so everything before it bounds the constituents table ...
headSection = wikiResponse.text.split('0001555280')[0]
# ... and segment index 3 (right after the third occurrence of
# '500 component stocks') is where the table rows begin.
tableSection = headSection.split('500 component stocks')[3]

anchorChunks = tableSection.split('href=')

# Chunks 0-4 are table-header markup; from chunk 5 on, any anchor whose
# URL mentions "nyse" or "nasdaq" carries the ticker symbol as its text.
for chunk in anchorChunks[5:]:
    if 'nyse' in chunk:
        companySymbolList['Company'].append(chunk.split('>')[1].split('<')[0])

    if 'nasdaq' in chunk:
        companySymbolList['Company'].append(chunk.split('>')[1].split('<')[0])

print(companySymbolList)

htmlText = response.text

# Quote-summary fields to collect; each label maps to a list of values.
indicatorDict = {label: [] for label in (
    "Previous Close",
    "Open",
    "Bid",
    "Ask",
    "Day's Range",
    "52 Week Range",
    "Volume",
    "Avg. Volume",
    "Market Cap",
    "Beta",
    "PE Ratio (TTM)",
    "EPS (TTM)",
    "Earnings Date",
    "Dividend & Yield",
    "Ex-Dividend Date",
    "1y Target Est",
)}

# Labels whose text in the page HTML is entity-escaped, mapped to the
# literal string that occurs in the markup.
specialSplitDict = {"Day's Range": "Day&#x27;s Range",
                    "52 Week Range": "52 Week Range",
                    "Dividend & Yield": "Dividend &amp; Yield"}

for label in indicatorDict:
    if label in specialSplitDict:
        # Escaped label: the value is the 2nd '">' segment, up to '</td'.
        tail = htmlText.split(specialSplitDict[label])[1]
        value = tail.split('">')[1].split('</td')[0]
    else:
        # Plain label: the value is the 3rd '">' segment, up to '</'.
        tail = htmlText.split(label)[1]
        value = tail.split('">')[2].split('</')[0]

    indicatorDict[label].append(value)

#print(indicatorDict)

{'Company': ['MMM', 'ABT', 'ABBV', 'ABMD', 'ACN', 'ATVI', 'ADBE', 'AMD', 'AAP', 'AES', 'AMG', 'AFL', 'A', 'APD', 'AKAM', 'ALK', 'ALB', 'ARE', 'ALXN', 'ALGN', 'ALLE', 'AGN', 'ADS', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC', 'AME', 'AMGN', 'APH', 'APC', 'ADI', 'ANSS', 'ANTM', 'AON', 'AOS', 'APA', 'AIV', 'AAPL', 'AMAT', 'APTV', 'ADM', 'ARNC', 'ANET', 'AJG', 'AIZ', 'ATO', 'T', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'BHGE', 'BLL', 'BAC', 'BK', 'BAX', 'BBT', 'BDX', 'BRK.B', 'BBY', 'BIIB', 'BLK', 'HRB', 'BA', 'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BF.B', 'CHRW', 'COG', 'CDNS', 'CPB', 'COF', 'CPRI', 'CAH', 'KMX', 'CCL', 'CAT', 'CBRE', 'CBS', 'CE', 'CELG', 'CNC', 'CNP', 'CTL', 'CERN', 'CF', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'XEC', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CTXS', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CAG', 'CXO', 'COP', 'ED', 'STZ', 'COO', 'CPRT', 'GLW', 'CTVA', 'COST', 'COTY', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DVA', 'DE', 'DAL', 'XRAY', 'DVN', 'FANG', 'DLR', 'DFS', 'DISCA', 'DISCK', 'DISH', 'DG', 'DLTR', 'D', 'DOV', 'DOW', 'DTE', 'DUK', 'DRE', 'DD', 'DXC', 'ETFC', 'EMN', 'ETN', 'EBAY', 'ECL', 'EIX', 'EW', 'EA', 'EMR', 'ETR', 'EOG', 'EFX', 'EQIX', 'EQR', 'ESS', 'EL', 'EVRG', 'ES', 'RE', 'EXC', 'EXPE', 'EXPD', 'EXR', 'XOM', 'FFIV', 'FB', 'FAST', 'FRT', 'FDX', 'FIS', 'FITB', 'FE', 'FRC', 'FISV', 'FLT', 'FLIR', 'FLS', 'FMC', 'FL', 'F', 'FTNT', 'FTV', 'FBHS', 'FOXA', 'FOX', 'BEN', 'FCX', 'GPS', 'GRMN', 'IT', 'GD', 'GE', 'GIS', 'GM', 'GPC', 'GILD', 'GPN', 'GS', 'GWW', 'HAL', 'HBI', 'HOG', 'HIG', 'HAS', 'HCA', 'HCP', 'HP', 'HSIC', 'HSY', 'HES', 'HPE', 'HLT', 'HFC', 'HOLX', 'HD', 'HON', 'HRL', 'HST', 'HPQ', 'HUM', 'HBAN', 'HII', 'IDXX', 'INFO', 'ITW', 'ILMN', 'IR', 'INTC', 'ICE', 'IBM', 'INCY', 'IP', 'IPG', 'IFF', 'INTU', 'ISRG', 'IVZ', 'IPGP', 'IQV', 'IRM', 'JKHY', 'JEC', 'JBHT', 'JEF', 'SJM', 'JNJ', 'JCI', 'JPM', 'JNPR', 'KSU', 'K', 
'KEY', 'KEYS', 'KMB', 'KIM', 'KMI', 'KLAC', 'KSS', 'KHC', 'KR', 'LB', 'LHX', 'LH', 'LRCX', 'LW', 'LEG', 'LEN', 'LLY', 'LNC', 'LIN', 'LKQ', 'LMT', 'L', 'LOW', 'LYB', 'MTB', 'MAC', 'M', 'MRO', 'MPC', 'MKTX', 'MAR', 'MMC', 'MLM', 'MAS', 'MA', 'MKC', 'MXIM', 'MCD', 'MCK', 'MDT', 'MRK', 'MET', 'MTD', 'MGM', 'MCHP', 'MU', 'MSFT', 'MAA', 'MHK', 'TAP', 'MDLZ', 'MNST', 'MCO', 'MS', 'MOS', 'MSI', 'MSCI', 'MYL', 'NDAQ', 'NOV', 'NKTR', 'NTAP', 'NFLX', 'NWL', 'NEM', 'NWSA', 'NWS', 'NEE', 'NLSN', 'NKE', 'NI', 'NBL', 'JWN', 'NSC', 'NTRS', 'NOC', 'NCLH', 'NRG', 'NUE', 'NVDA', 'ORLY', 'OXY', 'OMC', 'OKE', 'ORCL', 'PCAR', 'PKG', 'PH', 'PAYX', 'PYPL', 'PNR', 'PBCT', 'PEP', 'PKI', 'PRGO', 'PFE', 'PM', 'PSX', 'PNW', 'PXD', 'PNC', 'PPG', 'PPL', 'PFG', 'PG', 'PGR', 'PLD', 'PRU', 'PEG', 'PSA', 'PHM', 'PVH', 'QRVO', 'PWR', 'QCOM', 'DGX', 'RL', 'RJF', 'RTN', 'O', 'RHT', 'REG', 'REGN', 'RF', 'RSG', 'RMD', 'RHI', 'ROK', 'ROL', 'ROP', 'ROST', 'RCL', 'CRM', 'SBAC', 'SLB', 'STX', 'SEE', 'SRE', 'SHW', 'SPG', 'SWKS', 'SLG', 'SNA', 'SO', 'LUV', 'SPGI', 'SWK', 'SBUX', 'STT', 'SYK', 'STI', 'SIVB', 'SYMC', 'SYF', 'SNPS', 'SYY', 'TROW', 'TTWO', 'TPR', 'TGT', 'TEL', 'FTI', 'TFX', 'TXN', 'TXT', 'TMO', 'TIF', 'TWTR', 'TJX', 'TMK', 'TSS', 'TSCO', 'TDG', 'TRV', 'TRIP', 'TSN', 'UDR', 'ULTA', 'USB', 'UAA', 'UA', 'UNP', 'UAL', 'UNH', 'UPS', 'URI', 'UTX', 'UHS', 'UNM', 'VFC', 'VLO', 'VAR', 'VTR', 'VRSN', 'VRSK', 'VZ', 'VRTX', 'VIAB', 'V', 'VNO', 'VMC', 'WAB', 'WMT', 'WBA', 'DIS', 'WM', 'WAT', 'WEC', 'WCG', 'WFC', 'WELL', 'WDC', 'WU', 'WRK', 'WY', 'WHR', 'WMB', 'WLTW', 'WYNN', 'XEL', 'XRX', 'XLNX', 'XYL', 'YUM', 'ZBH', 'ZION', 'ZTS']}

#####################################################################################################

"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:sNp.py
@time: 7/7/2019 3:42 PM
"""

import requests

# Pull the S&P 500 ticker symbols out of the Wikipedia constituents table.
wikiURL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wikiResponse = requests.get(wikiURL)

companySymbolList = []

# '0001555280' is the (unique) CIK of the table's last row, so the text
# before it contains the whole constituents table; segment index 3 of the
# '500 component stocks' split is where the table rows start.
beforeLastCik = wikiResponse.text.split('0001555280')[0]
tableMarkup = beforeLastCik.split('500 component stocks')[3]

# Anchors 0-4 belong to the page header; afterwards, any <a> whose URL
# names "nyse" or "nasdaq" has the ticker symbol as its link text.
for anchor in tableMarkup.split('href=')[5:]:
    if 'nyse' in anchor:
        companySymbolList.append(anchor.split('>')[1].split('<')[0])

    if 'nasdaq' in anchor:
        companySymbolList.append(anchor.split('>')[1].split('<')[0])


#url = "https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch"
# Visit each ticker's Yahoo Finance quote page and cut the summary fields
# out of the raw HTML with string splits.
for tickerSymbol in companySymbolList:
    url = "https://finance.yahoo.com/quote/" + tickerSymbol + "?p=" + tickerSymbol + "&.tsrc=fin-srch"

    response = requests.get(url)
    htmlText = response.text

    # Rebuilt for every ticker, so each print below shows one ticker's data.
    indicatorDict = {"Previous Close":[],
                  "Open":[],
                  "Bid":[],
                  "Ask":[],
                  "Day's Range":[],
                  "52 Week Range":[],
                  "Volume":[],
                  "Avg. Volume":[],
                  "Market Cap":[],
                  "Beta":[],
                  "PE Ratio (TTM)":[],
                  "EPS (TTM)":[],
                  "Earnings Date":[],
                  "Dividend & Yield":[],
                  "Ex-Dividend Date":[],
                  "1y Target Est":[]
                  }


    # Labels whose on-page text is HTML-entity-escaped, mapped to the exact
    # string that occurs in the markup.
    specialSplitDict={"Day's Range":"Day&#x27;s Range",
                      "52 Week Range": "52 Week Range",
                      "Dividend & Yield":"Dividend &amp; Yield"}

    for indicator in indicatorDict:
        if indicator not in specialSplitDict:
            # Plain label: value sits in the 3rd '">' segment, ends at '</'.
            # NOTE(review): splitList[1] raises IndexError when the label is
            # absent from a page (irregular tickers) — handled by the later
            # try/except version of this loop.
            splitList = htmlText.split(indicator)
            secondSplitList = splitList[1].split("\">")[2]
            data = secondSplitList.split("</")[0]
        else:                                          #replaced
            # Escaped label: value sits in the 2nd '">' segment, ends at '</td'.
            splitList = htmlText.split(specialSplitDict[indicator])
            secondSplitList = splitList[1].split("\">")[1]
            data = secondSplitList.split("</td")[0]

        indicatorDict[indicator].append(data)

    print(tickerSymbol,": ", indicatorDict)

"list index out of range" occurred since BF.B is not a normal stock and has a different quote summary. And I don't want it, so I can fill its data with 'N/A'

# Per-indicator scrape with a fallback: quote pages of irregular tickers
# (e.g. BF.B) lack some labels, which makes the [1]/[2] indexing below
# raise IndexError; record 'N/A' for those instead of crashing.
for indicator in indicatorDict:
    try:
        if indicator not in specialSplitDict:
            # Plain label: value is the 3rd '">' segment, up to '</'.
            splitList = htmlText.split(indicator)
            secondSplitList = splitList[1].split("\">")[2]
            data = secondSplitList.split("</")[0]
        else:                                          #replaced
            # Escaped label: value is the 2nd '">' segment, up to '</td'.
            splitList = htmlText.split(specialSplitDict[indicator])
            secondSplitList = splitList[1].split("\">")[1]
            data = secondSplitList.split("</td")[0]
        indicatorDict[indicator].append(data)
    except IndexError:
        # Narrowed from a bare `except:` so that unrelated failures
        # (including KeyboardInterrupt/SystemExit) are not swallowed.
        indicatorDict[indicator].append('N/A')

print(tickerSymbol,": ", indicatorDict)

##############################################all codes###########################################

#you may found some codes are different with previous since the website changed the data table structure.

################################################################################################

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:sNp.py
@time: 7/7/2019 3:42 PM
"""

import requests
import pandas as pd

# Scrape the S&P 500 ticker list from the Wikipedia constituents table.
wikiURL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wikiResponse = requests.get(wikiURL)

companySymbolDict = {"Ticker":[]}

# The table is bounded by '0001555280' (the unique CIK of its final row);
# segment index 3 of the '500 component stocks' split holds the row markup.
upToLastCik = wikiResponse.text.split('0001555280')[0]
rowMarkup = upToLastCik.split('500 component stocks')[3]

linkChunks = rowMarkup.split('href=')
# Skip the first five chunks (header markup); a link whose URL contains
# "nyse" or "nasdaq" carries the ticker symbol as its anchor text.
for linkChunk in linkChunks[5:]:
    if 'nyse' in linkChunk:
        companySymbolDict["Ticker"].append(linkChunk.split('>')[1].split('<')[0])

    if 'nasdaq' in linkChunk:
        companySymbolDict["Ticker"].append(linkChunk.split('>')[1].split('<')[0])


# One empty list per quote-summary field; insertion order fixes both the
# scraping order and the final DataFrame column order.
indicatorDict = {label: [] for label in (
    "Previous Close",
    "Open",
    "Bid",
    "Ask",
    "Day's Range",
    "52 Week Range",
    "Volume",
    "Avg. Volume",
    "Market Cap",
    "Beta",
    "PE Ratio (TTM)",
    "EPS (TTM)",
    "Earnings Date",
    "Dividend & Yield",
    "Ex-Dividend Date",
    "1y Target Est",
)}

# Labels whose on-page text differs from the dict key (HTML entities),
# or that need the special extraction branch, mapped to the split string.
specialSplitDict = {"Day's Range": "Day&#x27;s Range",
                    "52 Week Range": "52 Week Range",
                    "Dividend & Yield": "Dividend &amp; Yield",
                    "Bid": "Bid",
                    "Ask": "Ask"}

#url = "https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch"
# Visit every ticker's quote page and cut each indicator's value out of
# the raw HTML by string splitting.
for tickerSymbol in companySymbolDict["Ticker"]:
    url = "https://finance.yahoo.com/quote/" + tickerSymbol + "?p=" + tickerSymbol + "&.tsrc=fin-srch"

    response = requests.get(url)
    htmlText = response.text

    for indicator in indicatorDict:
        try:
            if indicator not in specialSplitDict:
                # Plain label: value is the 3rd '">' segment, up to '</'.
                splitList = htmlText.split(indicator)
                secondSplitList = splitList[1].split("\">")[2]
                data = secondSplitList.split("</")[0]
            else:                                          #replaced
                # Special label: value is the 2nd '">' segment, up to '</td'.
                splitList = htmlText.split(specialSplitDict[indicator])
                secondSplitList = splitList[1].split("\">")[1]
                data = secondSplitList.split("</td")[0]
            indicatorDict[indicator].append(data)
        except IndexError:
            # Label missing on this quote page (irregular tickers such as
            # BF.B): pad with 'N/A' to keep all lists the same length.
            # Narrowed from a bare `except:` so unrelated errors surface.
            indicatorDict[indicator].append('N/A')

    #print(tickerSymbol,": ", indicatorDict)

# Merge the scraped indicator columns into the ticker dict; every value
# list has one entry per ticker, so the rows line up.
companySymbolDict.update(indicatorDict)

print(len(companySymbolDict["Ticker"])) #504
frame = pd.DataFrame(companySymbolDict)

print(frame.head())

Jupyter Notebook:

Pycharm

#####################################################################################################

XPath Version

#####################################################################################################

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:sNp500XPath.py
@time: 7/8/2019 1:27 AM
"""

import requests
import pandas as pd
from lxml import etree

# Scrape the S&P 500 tickers from the Wikipedia constituents table,
# this time with XPath instead of manual string splitting.
wikiURL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wikiResponse = requests.get(wikiURL)
html = wikiResponse.content.decode('utf-8')
#print(html)

companySymbolDict = {"Ticker":[]} # pd.DataFrame(companySymbolDict)

selector = etree.HTML(html)

# The anchor text of every link whose URL names the exchange is the
# ticker symbol; query NYSE first, then NASDAQ (same order as before).
for exchange in ("nyse", "nasdaq"):
    tickerSymbolList = selector.xpath(
        '//table[@id="constituents"]/tbody/tr/td/a[contains(@href, "{0}")]/text()'.format(exchange))
    companySymbolDict["Ticker"].extend(tickerSymbolList)

print(companySymbolDict)
print(len(companySymbolDict["Ticker"]))

output:

D:\Anaconda3\python.exe J:/PycharmProjects/scrapers/yahoo/sNp500XPath.py
{'Ticker': ['MMM', 'ABT', 'ABBV', 'ACN', 'AAP', 'AES', 'AMG', 'AFL', 'A', 'APD', 'ALK', 'ALB', 'ARE', 'ALLE', 'AGN', 'ADS', 'LNT', 'ALL', 'MO', 'AMCR', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC', 'AME', 'APH', 'APC', 'ANTM', 'AON', 'AOS', 'APA', 'AIV', 'APTV', 'ADM', 'ARNC', 'ANET', 'AJG', 'AIZ', 'ATO', 'T', 'AZO', 'AVB', 'AVY', 'BHGE', 'BLL', 'BAC', 'BK', 'BAX', 'BBT', 'BDX', 'BRK.B', 'BBY', 'BLK', 'HRB', 'BA', 'BWA', 'BXP', 'BSX', 'BMY', 'BR', 'BF.B', 'COG', 'CPB', 'COF', 'CPRI', 'CAH', 'KMX', 'CCL', 'CAT', 'CBRE', 'CBS', 'CE', 'CNC', 'CNP', 'CTL', 'CF', 'SCHW', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'XEC', 'C', 'CFG', 'CLX', 'CMS', 'KO', 'CL', 'CMA', 'CAG', 'CXO', 'COP', 'ED', 'STZ', 'COO', 'GLW', 'CTVA', 'COTY', 'CCI', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DVA', 'DE', 'DAL', 'DVN', 'DLR', 'DFS', 'DG', 'D', 'DOV', 'DOW', 'DTE', 'DUK', 'DRE', 'DD', 'DXC', 'EMN', 'ETN', 'ECL', 'EIX', 'EW', 'EMR', 'ETR', 'EOG', 'EFX', 'EQR', 'ESS', 'EL', 'EVRG', 'ES', 'RE', 'EXC', 'EXR', 'XOM', 'FRT', 'FDX', 'FIS', 'FE', 'FRC', 'FLT', 'FLS', 'FMC', 'FL', 'F', 'FTV', 'FBHS', 'BEN', 'FCX', 'GPS', 'IT', 'GD', 'GE', 'GIS', 'GM', 'GPC', 'GPN', 'GS', 'GWW', 'HAL', 'HBI', 'HOG', 'HIG', 'HCA', 'HCP', 'HP', 'HSY', 'HES', 'HPE', 'HLT', 'HFC', 'HD', 'HON', 'HRL', 'HST', 'HPQ', 'HUM', 'HII', 'ITW', 'IR', 'ICE', 'IBM', 'IP', 'IPG', 'IFF', 'IVZ', 'IQV', 'IRM', 'JEC', 'JEF', 'SJM', 'JNJ', 'JCI', 'JPM', 'JNPR', 'KSU', 'K', 'KEY', 'KEYS', 'KMB', 'KIM', 'KMI', 'KSS', 'KR', 'LB', 'LHX', 'LH', 'LW', 'LEG', 'LEN', 'LLY', 'LNC', 'LIN', 'LMT', 'L', 'LOW', 'LYB', 'MTB', 'MAC', 'M', 'MRO', 'MPC', 'MMC', 'MLM', 'MAS', 'MA', 'MKC', 'MCD', 'MCK', 'MDT', 'MRK', 'MET', 'MTD', 'MGM', 'MAA', 'MHK', 'TAP', 'MCO', 'MS', 'MOS', 'MSI', 'MSCI', 'NOV', 'NEM', 'NEE', 'NLSN', 'NKE', 'NI', 'NBL', 'JWN', 'NSC', 'NOC', 'NCLH', 'NRG', 'NUE', 'OXY', 'OMC', 'OKE', 'ORCL', 'PKG', 'PH', 'PNR', 'PKI', 'PRGO', 'PFE', 'PM', 'PSX', 'PNW', 'PXD', 'PNC', 'PPG', 'PPL', 'PFG', 'PG', 'PGR', 'PLD', 'PRU', 'PEG', 'PSA', 'PHM', 'PVH', 
'PWR', 'DGX', 'RL', 'RJF', 'RTN', 'O', 'RHT', 'REG', 'RF', 'RSG', 'RMD', 'RHI', 'ROK', 'ROL', 'ROP', 'RCL', 'CRM', 'SLB', 'SEE', 'SRE', 'SHW', 'SPG', 'SLG', 'SNA', 'SO', 'LUV', 'SPGI', 'SWK', 'STT', 'SYK', 'STI', 'SYF', 'SYY', 'TPR', 'TGT', 'TEL', 'FTI', 'TFX', 'TXT', 'TMO', 'TIF', 'TWTR', 'TJX', 'TMK', 'TSS', 'TDG', 'TRV', 'TSN', 'UDR', 'USB', 'UAA', 'UA', 'UNP', 'UAL', 'UNH', 'UPS', 'URI', 'UTX', 'UHS', 'UNM', 'VFC', 'VLO', 'VAR', 'VTR', 'VZ', 'V', 'VNO', 'VMC', 'WAB', 'WMT', 'DIS', 'WM', 'WAT', 'WEC', 'WCG', 'WFC', 'WELL', 'WU', 'WRK', 'WY', 'WHR', 'WMB', 'XEL', 'XRX', 'XYL', 'YUM', 'ZBH', 'ZTS', 'ABMD', 'ATVI', 'ADBE', 'AMD', 'AKAM', 'ALXN', 'ALGN', 'GOOGL', 'GOOG', 'AMZN', 'AAL', 'AMGN', 'ADI', 'ANSS', 'AAPL', 'AMAT', 'ADSK', 'ADP', 'BIIB', 'BKNG', 'AVGO', 'CHRW', 'CDNS', 'CELG', 'CERN', 'CHTR', 'CINF', 'CTAS', 'CSCO', 'CTXS', 'CME', 'CTSH', 'CMCSA', 'CPRT', 'COST', 'CSX', 'XRAY', 'FANG', 'DISCA', 'DISCK', 'DISH', 'DLTR', 'ETFC', 'EBAY', 'EA', 'EQIX', 'EXPE', 'EXPD', 'FFIV', 'FB', 'FAST', 'FITB', 'FISV', 'FLIR', 'FTNT', 'FOXA', 'FOX', 'GRMN', 'GILD', 'HAS', 'HSIC', 'HOLX', 'HBAN', 'IDXX', 'INFO', 'ILMN', 'INTC', 'INCY', 'INTU', 'ISRG', 'IPGP', 'JKHY', 'JBHT', 'KLAC', 'KHC', 'LRCX', 'LKQ', 'MKTX', 'MAR', 'MXIM', 'MCHP', 'MU', 'MSFT', 'MDLZ', 'MNST', 'MYL', 'NDAQ', 'NKTR', 'NTAP', 'NFLX', 'NWL', 'NWSA', 'NWS', 'NTRS', 'NVDA', 'ORLY', 'PCAR', 'PAYX', 'PYPL', 'PBCT', 'PEP', 'QRVO', 'QCOM', 'REGN', 'ROST', 'SBAC', 'STX', 'SWKS', 'SBUX', 'SIVB', 'SYMC', 'SNPS', 'TROW', 'TTWO', 'TXN', 'TSCO', 'TRIP', 'ULTA', 'VRSN', 'VRSK', 'VRTX', 'VIAB', 'WBA', 'WDC', 'WLTW', 'WYNN', 'XLNX', 'ZION']}
504

Process finished with exit code 0

#####################################################################################################

I want to save the data of the company symbol tickers(the list in the dict{key:[]}) and quote-summaries(the list in dict{key:[]}) in a dataframe, so they have to be the same length 

Problem: "Volume" vs. "Avg. Volume" (matching on the label "Volume" also hits the text inside "Avg. Volume")

Solutions:

span[text()="{0}"] instead of span[contains(text(),"{0}")]

two earnings date:

Solution:

del(indicatorDict[indicator][-1])

#######################################Request + Xpath All Codes######################################

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:sNp500XPath.py
@time: 7/8/2019 1:27 AM
"""

import requests
import pandas as pd
from lxml import etree

# Scrape the S&P 500 ticker symbols from Wikipedia with XPath.
wikiURL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wikiResponse = requests.get(wikiURL)
html = wikiResponse.content.decode('utf-8')
#print(html)

companySymbolDict = {"Ticker":[]} # pd.DataFrame(companySymbolDict)

selector = etree.HTML(html)

# NYSE-listed tickers first, then NASDAQ: the link text of any anchor
# whose URL names the exchange is the ticker symbol itself.
for exchange in ("nyse", "nasdaq"):
    found = selector.xpath(
        '//table[@id="constituents"]/tbody/tr/td/a[contains(@href, "{0}")]/text()'.format(exchange))
    companySymbolDict["Ticker"].extend(found)

# print(companySymbolDict)
# print(len(companySymbolDict["Ticker"]))

# One empty list per quote-summary field; insertion order drives both the
# scrape order and the DataFrame column order.
indicatorDict = {label: [] for label in (
    "Previous Close",
    "Open",
    "Bid",
    "Ask",
    "Day's Range",
    "52 Week Range",
    "Volume",
    "Avg. Volume",      # note: space after the dot
    "Market Cap",
    "Beta (3Y Monthly)",
    "PE Ratio (TTM)",
    "EPS (TTM)",
    "Earnings Date",
    "Forward Dividend & Yield",
    "Ex-Dividend Date",
    "1y Target Est",
)}

# Labels whose on-page text is HTML-escaped, mapped to the literal string
# that appears in the markup.
specialSplitDict = {"Day's Range": "Day&#x27;s Range",
                    "52 Week Range": "52 Week Range",
                    "Forward Dividend & Yield": "Forward Dividend &amp; Yield",
                    #"Bid": "Bid",  # sometimes these data are missing and
                    #"Ask": "Ask"   # the page structure shifts a little
                    }

counter=0
#url = "https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch"
# Scrape every ticker's quote page; `counter` tracks how many tickers have
# been processed so each indicator list can be padded/trimmed to hold
# exactly one entry per ticker.
for tickerSymbol in companySymbolDict["Ticker"]:
    url = "https://finance.yahoo.com/quote/" + tickerSymbol + "?p=" + tickerSymbol + "&.tsrc=fin-srch"
    #https://finance.yahoo.com/quote/BF.B?p=BF.B&.tsrc=fin-srch
    req = requests.get(url)
    # 'ignore' drops undecodable bytes instead of raising UnicodeDecodeError.
    html = req.content.decode('utf-8','ignore')
    selector = etree.HTML(html)

    # dataList mirrors this ticker's row (one value per indicator) for the
    # progress print below; `i` counts indicators handled so far.
    dataList=[]
    counter=counter+1
    i=0
    for indicator in indicatorDict:
        i=i+1
        try:
            if indicator not in specialSplitDict:          #span[contains(text(),"{0}"]
                # Exact-text match (span[text()="..."]) so "Volume" does not
                # also match inside "Avg. Volume"; value is a nested <span>.
                data = selector.xpath('//div[@id="quote-summary"]//span[text()='
                                      '"{0}"]/../../td[2]/span/text()'.format(indicator))
            else:                                         #span[contains(text(),"%s"]
                # Special labels: the value is the direct text of the <td>,
                # not wrapped in a <span>.
                data = selector.xpath('//div[@id="quote-summary"]//span[text()='
                                     '"%s"]/../../td[2]/text()' % indicator)  #without replaced

            indicatorDict[indicator].extend(data) #dataList
            dataList.extend(data)
        except:
            indicatorDict[indicator].append('N/A')
            dataList.append('N/A')

        # Repair the row so each indicator contributed exactly one value:
        # pad with 'N/A' when the XPath matched nothing, and drop extras
        # when it matched more than once (e.g. two "Earnings Date" cells).
        if(len(dataList)<i):
            dataList.append('N/A')
        if len(indicatorDict[indicator]) <counter:
            indicatorDict[indicator].append('N/A')
        if(len(dataList)>i):
            del(dataList[-1])
        if len(indicatorDict[indicator]) > counter:
            del(indicatorDict[indicator][-1])
    print(counter,tickerSymbol,dataList)


# Attach the indicator columns to the ticker dict; the pad/trim logic in
# the scrape loop guarantees every list has one entry per ticker.
companySymbolDict.update(indicatorDict)
#print(companySymbolDict)

print(len(companySymbolDict['Ticker'])) #504
print(len(indicatorDict["Previous Close"])) #504
df = pd.DataFrame(companySymbolDict)

# Bug fix: call .head() — a bare `df.head` printed the bound method's
# repr instead of the first five rows of the DataFrame.
print(df.head())

part of output snapshot

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

LIQING LIN

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值