import contextlib
import datetime
import io
import os
import sqlite3
import sys
from time import sleep

import pandas as pd
import requests


class MetaCrawler:
    """Crawl daily market meta data from finance.naver.com into SQLite.

    Each public ``crawl_*`` method creates its table if needed, pages through
    the site's daily-quote listing, and inserts the rows that are not stored
    yet.  A series stops being paged as soon as a row is reached that is
    already stored or is not newer than the newest stored date (or than
    ``START_DATE`` when the table has no rows for that series).

    Dates are compared as plain strings, so ``START_DATE`` must use the same
    textual form as the stored ``ymd`` values (presumably ``YYYY.MM.DD`` --
    TODO confirm against the site).  ``None`` means "no lower bound".
    """

    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
    limit_page_count = 1000000
    START_DATE = None

    # How often one page is requested before the crawl gives up.
    max_fetch_attempts = 5
    # Base delay in seconds between failed attempts (multiplied by the attempt number).
    retry_delay = 1.0
    # Pause in seconds after every successful request.
    request_interval = 0.5

    def __init__(self, START_DATE=None):
        """Remember the lower bound used for series that have no stored rows.

        Args:
            START_DATE: Date string; rows dated on or before it are not
                fetched.  ``None`` disables the bound.
        """
        self.START_DATE = START_DATE

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _fetch_table(self, url, encoding=None):
        """Download ``url`` and return the first HTML table as a DataFrame.

        Args:
            url: Full page URL.
            encoding: Optional charset forced on the HTTP response before its
                text is decoded.

        Returns:
            The first table found on the page.

        Raises:
            RuntimeError: When every attempt failed; the last error is chained.
        """
        last_error = None
        for attempt in range(1, self.max_fetch_attempts + 1):
            try:
                response = requests.get(url, headers=self.header, timeout=30)
                response.raise_for_status()
                if encoding is not None:
                    # Apply the charset where the bytes are actually decoded.
                    response.encoding = encoding
                # Newer pandas no longer accepts literal HTML strings, so the
                # text is wrapped in a file-like object.
                tables = pd.read_html(io.StringIO(response.text))
            except Exception as exc:  # best-effort retry boundary
                last_error = exc
                print("{} ({})".format(url, exc))
                if attempt < self.max_fetch_attempts:
                    sleep(self.retry_delay * attempt)
            else:
                sleep(self.request_interval)
                return tables[0]
        raise RuntimeError(
            "giving up on {} after {} attempts".format(url, self.max_fetch_attempts)
        ) from last_error

    @staticmethod
    def _prepare_table(cursor, table_name, leading_defs, value_columns, value_type, index_columns):
        """Create the table and its lookup index when they do not exist yet.

        Args:
            cursor: Open sqlite3 cursor.
            table_name: Name of the table.
            leading_defs: Column definitions placed before the value columns.
            value_columns: Names of the value columns.
            value_type: SQL type shared by all value columns.
            index_columns: Columns covered by the ``<table>_idx`` index.
        """
        value_defs = ", ".join("{} {}".format(column, value_type) for column in value_columns)
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS " + table_name + " (" + leading_defs + ", " + value_defs + ")"
        )
        cursor.execute(
            "CREATE INDEX IF NOT EXISTS " + table_name + "_idx on " + table_name
            + " (" + ", ".join(index_columns) + ") "
        )

    def _crawl_series(self, cursor, table_name, page_url, value_columns, log_format,
                      log_fields, code=None, name=None, date_prefix="",
                      empty_page_rows=1, encoding=None):
        """Page through one listing and insert every row that is not stored yet.

        Nothing is committed here; the caller commits after the series has
        finished so that a failed download never leaves a partial series.

        Args:
            cursor: Open sqlite3 cursor.
            table_name: Destination table.
            page_url: URL to which the 1-based page number is appended.
            value_columns: Destination columns filled from the table cells
                following the date cell, in order.
            log_format: ``str.format`` template printed per inserted row; it
                receives ``ymd`` plus everything in ``log_fields``.
            log_fields: Extra fields for ``log_format``.
            code: Series code stored in ``CODE``; ``None`` for tables that are
                keyed by date only.
            name: Series name stored in ``NAME`` (used only with ``code``).
            date_prefix: Text put in front of the site's date cell to build ``ymd``.
            empty_page_rows: A page with at most this many rows ends the crawl.
            encoding: Optional charset forced on the HTTP response.

        Raises:
            RuntimeError: When a page could not be downloaded.
        """
        if code is None:
            id_columns, id_values, key_values = [], [], []
            latest_sql = 'SELECT ymd FROM ' + table_name + ' order by ymd desc'
            exists_sql = 'SELECT 1 FROM ' + table_name + ' WHERE ymd=?'
        else:
            id_columns, id_values, key_values = ["CODE", "NAME"], [code, name], [code]
            latest_sql = 'SELECT ymd FROM ' + table_name + ' WHERE CODE=? order by ymd desc'
            exists_sql = 'SELECT 1 FROM ' + table_name + ' WHERE CODE=? and ymd=?'

        columns = id_columns + ["ymd"] + list(value_columns)
        insert_sql = "INSERT INTO {}({}) VALUES({})".format(
            table_name, ", ".join(columns), ", ".join("?" * len(columns))
        )

        cursor.execute(latest_sql, key_values)
        newest = cursor.fetchone()
        last_day = self.START_DATE if newest is None else newest[0]

        for page in range(1, self.limit_page_count):
            table = self._fetch_table(page_url + str(page), encoding)

            # Keep paging until a page without (enough) data rows shows up.
            if len(table["날짜"]) <= empty_page_rows:
                break

            reached_known_data = False
            for row in table.values:
                raw_date = row[0]
                # Filler rows carry NaN instead of a date string.
                if not isinstance(raw_date, str):
                    continue
                ymd = date_prefix + raw_date

                if last_day is not None and ymd <= last_day:
                    reached_known_data = True
                    break

                # An existing row means the rest is already stored; this also
                # covers a page that was already processed during this run
                # being served again.
                cursor.execute(exists_sql, key_values + [ymd])
                if cursor.fetchone() is not None:
                    reached_known_data = True
                    break

                values = list(row[1:1 + len(value_columns)])
                cursor.execute(insert_sql, id_values + [ymd] + values)
                print(log_format.format(ymd=ymd, **log_fields))

            if reached_known_data:
                break

    def _crawl_coded_table(self, inFileName, table_name, value_columns, index_columns,
                           url_template, series, log_format):
        """Crawl several (NAME, CODE) series into one ``CODE``/``NAME``/``ymd`` table.

        Each series is committed on its own once it has been crawled
        completely.  If a download fails the error propagates, the connection
        is closed and the unfinished series is not committed.

        Args:
            inFileName: Path of the SQLite database file.
            table_name: Destination table.
            value_columns: REAL columns filled from the cells after the date.
            index_columns: Columns of the lookup index.
            url_template: URL with one ``{}`` placeholder for the series code.
            series: Iterable of ``(name, code)`` pairs.
            log_format: Progress template (fields ``idx``, ``ymd``, ``code``, ``name``).
        """
        with contextlib.closing(sqlite3.connect(inFileName)) as conn, \
                contextlib.closing(conn.cursor()) as cursor:
            self._prepare_table(cursor, table_name, "CODE text, NAME text, ymd text",
                                value_columns, "REAL", index_columns)
            for idx, (name, code) in enumerate(series):
                self._crawl_series(
                    cursor, table_name, url_template.format(code) + '&page=',
                    value_columns, log_format,
                    {"idx": idx, "code": code, "name": name},
                    code=code, name=name,
                )
                conn.commit()

    def _crawl_dated_table(self, inFileName, table_name, leading_defs, value_columns,
                           page_url, log_format, empty_page_rows, encoding=None):
        """Crawl one listing into a table keyed by ``ymd`` only.

        The site's date cell is prefixed with ``"20"`` to build ``ymd``.  The
        data is committed once the listing has been crawled completely.

        Args:
            inFileName: Path of the SQLite database file.
            table_name: Destination table.
            leading_defs: Definition of the ``ymd`` column.
            value_columns: Integer columns filled from the cells after the date.
            page_url: URL to which the page number is appended.
            log_format: Progress template (field ``ymd``).
            empty_page_rows: A page with at most this many rows ends the crawl.
            encoding: Optional charset forced on the HTTP response.
        """
        with contextlib.closing(sqlite3.connect(inFileName)) as conn, \
                contextlib.closing(conn.cursor()) as cursor:
            self._prepare_table(cursor, table_name, leading_defs, value_columns,
                                "integer", ("ymd",))
            self._crawl_series(
                cursor, table_name, page_url, value_columns, log_format, {},
                date_prefix="20", empty_page_rows=empty_page_rows, encoding=encoding,
            )
            conn.commit()

    # ------------------------------------------------------------------
    # Public crawlers
    # ------------------------------------------------------------------

    # Reference: http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221288761509
    def crawl_exchange(self, inFileName):
        """Crawl daily KRW exchange rates into table ``meta_1``.

        Args:
            inFileName: Path of the SQLite database file.

        Raises:
            RuntimeError: When a page could not be downloaded.
        """
        series = (
            ('USD', 'FX_USDKRW'),  # United States
            ('JPY', 'FX_JPYKRW'),  # Japan
            ('EUR', 'FX_EURKRW'),  # European Union
            ('CNY', 'FX_CNYKRW'),  # China
            ('HKD', 'FX_HKDKRW'),  # Hong Kong
            ('GBP', 'FX_GBPKRW'),  # United Kingdom
            ('CAD', 'FX_CADKRW'),  # Canada
            ('CHF', 'FX_CHFKRW'),  # Switzerland
            ('AUD', 'FX_AUDKRW'),  # Australia
            ('THB', 'FX_THBKRW'),  # Thailand
            ('INR', 'FX_INRKRW'),  # India
            ('PHP', 'FX_PHPKRW'),  # Philippines
            ('BRL', 'FX_BRLKRW'),  # Brazil
            ('VND', 'FX_VNDKRW'),  # Vietnam
            ('RUB', 'FX_RUBKRW'),  # Russia
            ('TWD', 'FX_TWDKRW'),  # Taiwan
        )
        value_columns = (
            'price',          # base exchange rate
            'diff',           # change from the previous day
            'cash_buy',       # cash, buying
            'cash_sell',      # cash, selling
            'transfer_buy',   # wire transfer, buying
            'transfer_sell',  # wire transfer, selling
        )
        self._crawl_coded_table(
            inFileName, 'meta_1', value_columns, ('CODE', 'ymd'),
            'http://finance.naver.com/marketindex/exchangeDailyQuote.nhn?marketindexCd={}',
            series, "{idx}. {ymd} {code} ({name})",
        )

    # Trading trend by investor type
    # Reference: http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221289696771&parentCategoryNo=&categoryNo=&viewDate=&isShowPopularPosts=false&from=postView
    def crawl_trading_trend(self, inFileName):
        """Crawl the daily trading trend per investor type into table ``meta_2``.

        Args:
            inFileName: Path of the SQLite database file.

        Raises:
            RuntimeError: When a page could not be downloaded.
        """
        value_columns = (
            'pri',   # individuals
            'fori',  # foreigners
            'ins',   # institutions, total
            'ins0',  # financial investment
            'ins1',  # insurance
            'ins2',  # investment trusts (private equity)
            'ins3',  # banks
            'ins4',  # other financial institutions
            'ins5',  # pension funds etc.
            'cor',   # other corporations
        )
        today = datetime.datetime.now().strftime("%Y%m%d")
        page_url = ('http://finance.naver.com/sise/investorDealTrendDay.nhn?bizdate='
                    + today + '&sosok=&page=')
        self._crawl_dated_table(
            inFileName, 'meta_2', "ymd text PRIMARY KEY", value_columns,
            page_url, "Trading_Trend {ymd}", empty_page_rows=2,
        )

    # Stock market money trend (credit balance, fund balances)
    # Reference: http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221290138187&parentCategoryNo=&categoryNo=&viewDate=&isShowPopularPosts=false&from=postView
    def crawl_money_trend(self, inFileName):
        """Crawl the daily market money trend into table ``meta_3``.

        Args:
            inFileName: Path of the SQLite database file.

        Raises:
            RuntimeError: When a page could not be downloaded.
        """
        value_columns = (
            'dep1_1',  # customer deposits, cumulative
            'dep1_2',  # customer deposits, daily
            'dep2_1',  # credit balance, cumulative
            'dep2_2',  # credit balance, daily
            'dep3_1',  # equity funds, cumulative
            'dep3_2',  # equity funds, daily
            'dep4_1',  # hybrid funds, cumulative
            'dep4_2',  # hybrid funds, daily
            'dep5_1',  # bond funds, cumulative
            'dep5_2',  # bond funds, daily
        )
        self._crawl_dated_table(
            inFileName, 'meta_3', "ymd text", value_columns,
            'http://finance.naver.com/sise/sise_deposit.nhn?&page=',
            "crawl_money_trend {ymd}", empty_page_rows=10, encoding='euc-kr',
        )

    # Domestic market interest rates
    # Reference: http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221292348073&parentCategoryNo=&categoryNo=&viewDate=&isShowPopularPosts=false&from=postView
    def crawl_interest_rates(self, inFileName):
        """Crawl daily domestic market interest rates into table ``meta_4``.

        Args:
            inFileName: Path of the SQLite database file.

        Raises:
            RuntimeError: When a page could not be downloaded.
        """
        series = (
            ('91일 CD금리', 'IRR_CD91'),
            ('콜금리', 'IRR_CALL'),
            ('국고채(3년)', 'IRR_GOVT03Y'),
            ('회사채(3년)', 'IRR_CORP03Y'),
        )
        value_columns = (
            'close',  # closing value
            'diff',   # change from the previous day
            'rate',   # percentage change
        )
        self._crawl_coded_table(
            inFileName, 'meta_4', value_columns, ('CODE', 'ymd'),
            'http://finance.naver.com/marketindex/interestDailyQuote.nhn?marketindexCd={}',
            series, "{ymd} {name}",
        )

    # Reference: http://blog.naver.com/PostView.nhn?blogId=koko8624&logNo=221288761509
    def crawl_meterials(self, inFileName):
        """Crawl daily commodity quotes into table ``meta_5``.

        Args:
            inFileName: Path of the SQLite database file.

        Raises:
            RuntimeError: When a page could not be downloaded.
        """
        series = (
            ('WTI', 'OIL_CL'),
            ('COPPER', 'CMDT_CDY'),
            ('NATURALGAS', 'CMDT_NG'),
            ('CORN', 'CMDT_C'),
            ('SOYBEAN', 'CMDT_S'),
            ('LEAD', 'CMDT_PDY'),
            ('NICKEL', 'CMDT_NDY'),
            ('SUGAR', 'CMDT_SB'),
            ('COFFEE', 'CMDT_KC'),
            ('COCOA', 'CMDT_CC'),
            ('GOLD', 'CMDT_GC'),
            ('SILVER', 'CMDT_SI'),
            ('PLATINUM', 'CMDT_PL'),
            ('PALADIUM', 'CMDT_PA'),
        )
        value_columns = (
            'close',  # closing value
            'diff',   # change from the previous day
            'rate',   # percentage change
        )
        self._crawl_coded_table(
            inFileName, 'meta_5', value_columns, ('CODE', 'NAME', 'ymd'),
            'http://finance.naver.com/marketindex/worldDailyQuote.nhn?marketindexCd={}&fdtc=2',
            series, "{idx}. {ymd} {code} ({name})",
        )


def main():
    """Run every crawler against ``<project home>/resources/stock.db``.

    The project home is four directory levels above this file.  An optional
    first command-line argument is used as ``START_DATE``; without it no lower
    date bound is applied.
    """
    # Resolve to an absolute path first so a relative __file__ cannot collapse
    # to an empty project home.
    project_home = os.path.abspath(__file__)
    for _ in range(4):
        project_home = os.path.dirname(project_home)
    inFileName = os.path.join(project_home, 'resources', 'stock.db')

    start_date = sys.argv[1] if len(sys.argv) > 1 else None
    metaCrawler = MetaCrawler(start_date)

    print("\n[환율 (USD, JPY, EUR, CNY)]")
    metaCrawler.crawl_exchange(inFileName)

    print("\n[투자자별 매매동향(Trading_Trend)]")
    metaCrawler.crawl_trading_trend(inFileName)

    print("\n[증시자금동향 (신용잔고, 펀드자금 잔고)]")
    metaCrawler.crawl_money_trend(inFileName)

    print("\n[국내 시장금리]")
    metaCrawler.crawl_interest_rates(inFileName)

    print("\n[원유 (WTI), 국제금, COPPER, NATURALGAS, CORN, SOYBEAN]")
    metaCrawler.crawl_meterials(inFileName)


if __name__ == "__main__":
    main()