import datetime
import json
import os

import pandas as pd
import requests


class MetaCrawler:
    """Incremental crawler for daily market tables served by finance.naver.com.

    Every ``crawl_*`` method treats ``inFileName`` as a JSON-lines file (one
    JSON object per line).  Records already stored there are loaded first,
    pages are then downloaded until a row that is not newer than the newest
    stored date is reached, and the merged result is written back to the same
    path.  The file is only rewritten after the whole crawl has succeeded, so
    a network or parsing error leaves the previously stored data untouched.
    """

    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/87.0.4280.88 Safari/537.36'
    }
    # Upper bound (exclusive) on the page number requested for a single table.
    limit_page_count = 10000
    # Seconds to wait for the server before a request is abandoned.
    request_timeout = 30

    # Date used as "newest stored date" when nothing has been stored yet.
    _DEFAULT_LAST_DAY = "1900.01.01"

    _FX_URL = 'http://finance.naver.com/marketindex/exchangeDailyQuote.nhn?marketindexCd=%s'
    _WORLD_URL = 'http://finance.naver.com/marketindex/worldDailyQuote.nhn?marketindexCd=%s&fdtc=2'
    _INTEREST_URL = 'http://finance.naver.com/marketindex/interestDailyQuote.nhn?marketindexCd=%s'

    # Output keys taken from the leading table columns, in column order.
    _FX_FIELDS = ("DATE", "close", "diff")            # date, base rate, change vs. previous day
    _QUOTE_FIELDS = ("DATE", "close", "diff", "rate")  # date, close, change, fluctuation rate

    def __init__(self):
        """Create a crawler; all configuration lives in class attributes."""
        return

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _load_records(path):
        """Return the JSON objects stored one per line in *path*.

        Blank lines are ignored.  A missing file yields an empty list.
        """
        if not os.path.isfile(path):
            return []
        with open(path, 'r', encoding='utf-8') as in_fp:
            return [json.loads(text) for text in map(str.strip, in_fp) if text]

    @staticmethod
    def _dump_line(record):
        """Serialize *record* as one JSON line, keeping non-ASCII text readable."""
        return json.dumps(record, ensure_ascii=False) + "\n"

    @staticmethod
    def _save_lines(path, lines):
        """Replace the content of *path* with the already serialized *lines*."""
        with open(path, 'w', encoding='utf-8') as out_fp:
            out_fp.writelines(lines)

    @staticmethod
    def _is_missing(value):
        """Return True when a table cell holds pandas' NaN placeholder."""
        return str(value) == "nan"

    def _fetch_table(self, url, **read_html_kwargs):
        """Download *url* and return the first HTML table as a DataFrame.

        Extra keyword arguments are forwarded to ``pandas.read_html``.
        """
        response = requests.get(url, headers=self.header,
                                timeout=self.request_timeout)
        # NOTE(review): pandas >= 2.1 deprecates passing literal HTML text to
        # read_html.  The call is kept as-is because wrapping the text in a
        # file-like object together with ``encoding=`` needs to be verified
        # against the live pages first.
        return pd.read_html(response.text, **read_html_kwargs)[0]

    def _crawl_quote_series(self, inFileName, sources, verbose=False):
        """Update one ``{"NAME", "CODE", "PRICE": [...]}`` record per source.

        Each source is a dict with the keys ``NAME``, ``CODE``, ``URL``,
        ``FIELDS`` (output keys for the leading columns) and ``FIRST_ROW``
        (index of the first table row to read).  Stored records are matched
        to sources by position; a source without a stored record starts from
        an empty price list.  With *verbose* set, the last date appended from
        each page is printed as progress.
        """
        stored = self._load_records(inFileName)
        out_lines = []

        for position, source in enumerate(sources):
            if position < len(stored):
                meta = stored[position]
                prices = meta.setdefault("PRICE", [])
                # Stored prices are kept newest-first, so entry 0 is the newest.
                last_day = prices[0]['DATE'] if prices else self._DEFAULT_LAST_DAY
            else:
                prices = []
                meta = {"NAME": source['NAME'], "CODE": source['CODE'],
                        "PRICE": prices}
                last_day = self._DEFAULT_LAST_DAY

            fields = source['FIELDS']
            for page in range(1, self.limit_page_count):
                table = self._fetch_table('%s&page=%s' % (source['URL'], page))
                # A table with at most one row marks the end of the data.
                if len(table.날짜.values) <= 1:
                    break

                caught_up = False
                last_added = None
                for row in table.values[source['FIRST_ROW']:]:
                    date = row[0]
                    if self._is_missing(date):
                        continue
                    if date <= last_day:
                        caught_up = True
                        break
                    prices.append({key: row[col] for col, key in enumerate(fields)})
                    last_added = date

                if caught_up:
                    break
                if verbose and last_added is not None:
                    print(meta["NAME"] + " / " + last_added)

            meta["PRICE"] = sorted(prices, key=lambda entry: entry['DATE'],
                                   reverse=True)
            out_lines.append(self._dump_line(meta))

        self._save_lines(inFileName, out_lines)

    def _crawl_daily_table(self, inFileName, url, min_rows, value_keys,
                           **read_html_kwargs):
        """Prepend newly published rows of a paged daily table to the file.

        *url* is the page URL without the trailing page number.  A page whose
        table has *min_rows* rows or fewer ends the crawl.  The first table
        row is never read.  Column 0 holds the date without its century and
        columns 1.. are stored under *value_keys* in order.  New records are
        written in download order, followed by the previously stored ones.
        """
        stored = self._load_records(inFileName)
        last_day = stored[0]['DATE'] if stored else self._DEFAULT_LAST_DAY

        out_lines = []
        # Date found in row 2 of the previous page; seeing it again means the
        # site served the same page twice, i.e. the last page was passed.
        previous_marker = ""

        for page in range(1, self.limit_page_count):
            table = self._fetch_table(url + str(page), **read_html_kwargs)
            if len(table.날짜.values) <= min_rows:
                break

            rows = table.values
            caught_up = False
            for row in rows[1:]:
                short_date = row[0]
                if self._is_missing(short_date):
                    continue
                full_date = "20" + short_date
                if full_date <= last_day or short_date == previous_marker:
                    caught_up = True
                    break
                record = {"DATE": full_date}
                for col, key in enumerate(value_keys, start=1):
                    record[key] = row[col]
                out_lines.append(self._dump_line(record))
                print(full_date)

            if caught_up:
                break
            previous_marker = rows[2][0]

        out_lines.extend(self._dump_line(record) for record in stored)
        self._save_lines(inFileName, out_lines)

    # ------------------------------------------------------------------
    # Public crawlers
    # ------------------------------------------------------------------
    # Reference) http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221288761509
    def crawl_stocks(self, inFileName):
        """Update daily exchange-rate and commodity quotes in *inFileName*.

        Covers USD, JPY, EUR and CNY against KRW plus WTI oil and
        international gold.  Exchange-rate entries carry ``DATE``, ``close``
        (base rate) and ``diff`` (change vs. previous day); commodity entries
        additionally carry ``rate`` (fluctuation rate).  For exchange-rate
        tables the first row is skipped.  Rows whose date cell is missing are
        ignored.

        Args:
            inFileName: Path of the JSON-lines file to read and rewrite.
        """
        currencies = (
            ('USD', 'FX_USDKRW'),  # United States
            ('JPY', 'FX_JPYKRW'),  # Japan
            ('EUR', 'FX_EURKRW'),  # European Union
            ('CNY', 'FX_CNYKRW'),  # China
        )
        commodities = (
            ('WTI', 'OIL_CL'),     # WTI crude oil
            ('GOLD', 'CMDT_GC'),   # international gold
        )

        sources = []
        for name, code in currencies:
            sources.append({'NAME': name, 'CODE': code,
                            'URL': self._FX_URL % code,
                            'FIELDS': self._FX_FIELDS, 'FIRST_ROW': 1})
        for name, code in commodities:
            sources.append({'NAME': name, 'CODE': code,
                            'URL': self._WORLD_URL % code,
                            'FIELDS': self._QUOTE_FIELDS, 'FIRST_ROW': 0})

        self._crawl_quote_series(inFileName, sources)
        return

    # Trading trend by investor type
    # Reference) http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221289696771&parentCategoryNo=&categoryNo=&viewDate=&isShowPopularPosts=false&from=postView
    def crawl_trading_trend(self, inFileName):
        """Update the daily trading trend by investor type in *inFileName*.

        Each record holds ``DATE`` plus one value per investor group:
        ``pri`` individuals, ``for`` foreigners, ``ins`` institutions total,
        ``ins0`` financial investment, ``ins1`` insurance, ``ins2`` investment
        trusts (private equity), ``ins3`` banks, ``ins4`` other financial
        institutions, ``ins5`` pension funds etc., ``cor`` other corporations.
        Every stored date is printed as progress.

        Args:
            inFileName: Path of the JSON-lines file to read and rewrite.
        """
        today = datetime.datetime.now().strftime("%Y%m%d")
        url = ('http://finance.naver.com/sise/investorDealTrendDay.nhn?bizdate='
               + today + '&sosok=&page=')
        investor_keys = ("pri", "for", "ins", "ins0", "ins1", "ins2",
                         "ins3", "ins4", "ins5", "cor")
        self._crawl_daily_table(inFileName, url, 2, investor_keys)
        return

    # Stock market money trend (credit balance, fund balances)
    # Reference) http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221290138187&parentCategoryNo=&categoryNo=&viewDate=&isShowPopularPosts=false&from=postView
    def crawl_money_trend(self, inFileName):
        """Update the daily stock-market money trend in *inFileName*.

        Each record holds ``DATE`` plus a cumulative (``_1``) and a daily
        (``_2``) figure for: ``dep1`` customer deposits, ``dep2`` credit
        balance, ``dep3`` equity funds, ``dep4`` mixed funds and ``dep5``
        bond funds.  Every stored date is printed as progress.

        Args:
            inFileName: Path of the JSON-lines file to read and rewrite.
        """
        url = 'http://finance.naver.com/sise/sise_deposit.nhn?&page='
        deposit_keys = ("dep1_1", "dep1_2", "dep2_1", "dep2_2", "dep3_1",
                        "dep3_2", "dep4_1", "dep4_2", "dep5_1", "dep5_2")
        self._crawl_daily_table(inFileName, url, 10, deposit_keys,
                                encoding='euc-kr')
        return

    # Domestic market interest rates
    # Reference) http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221292348073&parentCategoryNo=&categoryNo=&viewDate=&isShowPopularPosts=false&from=postView
    def crawl_interest_rates(self, inFileName):
        """Update daily domestic market interest rates in *inFileName*.

        Covers the 91-day CD rate, the call rate, 3-year government bonds and
        3-year corporate bonds.  Entries carry ``DATE``, ``close`` (closing
        value), ``diff`` (change vs. previous day) and ``rate`` (fluctuation
        rate).  Progress is printed once per downloaded page.

        Args:
            inFileName: Path of the JSON-lines file to read and rewrite.
        """
        rates = (
            ('91일 CD금리', 'IRR_CD91'),
            ('콜금리', 'IRR_CALL'),
            ('국고채(3년)', 'IRR_GOVT03Y'),
            ('회사채(3년)', 'IRR_CORP03Y'),
        )
        sources = [
            {'NAME': name, 'CODE': code, 'URL': self._INTEREST_URL % code,
             'FIELDS': self._QUOTE_FIELDS, 'FIRST_ROW': 0}
            for name, code in rates
        ]
        self._crawl_quote_series(inFileName, sources, verbose=True)
        return