# https://bigdata-sk.tistory.com/10
"""Crawl daily stock prices from Naver Finance and compute moving averages.

Price histories are stored as JSON Lines: one stock per line, shaped like
``{"NAME": ..., "CODE": ..., "PRICE": [{"DATE": "YYYY-MM-DD", "close": ...}]}``.
"""
import json
import os
import re
from io import StringIO

import pandas as pd

try:
    import requests
except ImportError:
    # Only the network-facing crawl methods need requests; keeping the module
    # importable lets Queue and get_stocks_avg be used for offline processing.
    requests = None

# Window sizes (in rows) of the moving averages stored as 'avg<N>' fields.
_AVG_WINDOWS = (3, 5, 7, 10, 20, 30, 60, 90, 100, 120, 150, 180, 200, 240)

# Korean column headers of the Naver daily price table -> English names.
_PRICE_COLUMNS = {'날짜': 'date', '종가': 'close', '전일비': 'diff', '시가': 'open',
                  '고가': 'high', '저가': 'low', '거래량': 'volume'}
_NUMERIC_COLUMNS = ['close', 'diff', 'open', 'high', 'low', 'volume']

# A page with exactly this many dated rows is treated as a full page.
_ROWS_PER_PAGE = 10

# ETFs that crawl_stocks writes ahead of the regular listed companies.
_CRAWL_ETFS = (
    ('KODEX 코스닥150선물인버스', '251340'),
    ('KODEX 코스닥150 레버리지', '233740'),
    ('KODEX 200선물인버스2X', '252670'),
    ('KODEX 레버리지', '122630'),
    ('KODEX 골드선물(H)', '132030'),
)

# ETFs that update_stocks writes after the regular listed companies.
_UPDATE_ETFS = (
    ('KODEX 200선물인버스2X', '252670'),
    ('KODEX 레버리지', '122630'),
    ('KODEX 골드선물(H)', '132030'),
)


class Queue(object):
    """Fixed-capacity FIFO of numbers used as a moving-average window."""

    def __init__(self, max):
        self.queue = []
        self.max = max

    def dequeue(self):
        """Pop and return the oldest item, or -1 if the queue is not full."""
        length = len(self.queue)
        if length == 0 or length < self.max:
            return -1
        return self.queue.pop(0)

    def enqueue(self, n):
        """Append ``n``, evicting the oldest item first when at capacity."""
        if len(self.queue) == self.max:
            self.dequeue()
        self.queue.append(n)

    def sum(self):
        """Return the sum of the items currently held."""
        total = 0
        for item in self.queue:
            total += item
        return total

    def avg(self):
        """Return the rounded mean of the held items.

        Raises:
            ZeroDivisionError: if the queue is empty.
        """
        return round(self.sum() / len(self.queue))

    def print(self):
        """Print the current sum followed by the raw item list."""
        print(self.sum(), self.queue)


class _MovingAverages:
    """One Queue per window in _AVG_WINDOWS, fed with the same values."""

    def __init__(self):
        self._queues = [(window, Queue(window)) for window in _AVG_WINDOWS]

    def push(self, value):
        """Feed ``value`` into every window."""
        for _, window_queue in self._queues:
            window_queue.enqueue(value)

    def snapshot(self):
        """Return ``{'avg3': ..., ..., 'avg240': ...}`` for the current state."""
        return {'avg{}'.format(window): window_queue.avg()
                for window, window_queue in self._queues}


class StockCrawler:
    """Crawl KRX listings and Naver Finance daily prices into JSON Lines."""

    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
    historical_prices = None
    special_pattern = None
    fnGuideCrawler = None
    # Exclusive upper bound of the page numbers crawl_specific_stock requests.
    limit_page_count = 40
    # Seconds to wait for each HTTP response before giving up.
    request_timeout = 30

    def __init__(self):
        self.historical_prices = dict()
        self.special_pattern = (
            '[', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', ',', '.',
            '?', '"', ':', ';', '{', '}', '|', '<', '>', ']', '+', '-', '/',
            '=', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9')

    def clean_str(self, string):
        """Replace quotes, backslashes, dashes and parenthesised text with
        spaces, then strip and lower-case the result."""
        string = re.sub(r"\\", " ", string)
        string = re.sub(r"\'", " ", string)
        string = re.sub(r"\"", " ", string)
        string = re.sub(r"`", " ", string)
        string = re.sub(r"-", " ", string)
        string = re.sub(r"\(.*?\)", " ", string)
        # NOTE(review): as written this swaps a single space for a single
        # space (a no-op); possibly meant to collapse runs of spaces - confirm.
        string = re.sub(r" ", " ", string)
        return string.strip().lower()

    def _read_tables(self, url):
        """Download ``url`` and return the HTML tables it contains.

        Raises:
            ImportError: if the requests package is not installed.
        """
        if requests is None:
            raise ImportError("the 'requests' package is required for crawling")
        response = requests.get(url, headers=self.header,
                                timeout=self.request_timeout)
        # Wrap in StringIO: passing literal HTML to read_html is deprecated.
        return pd.read_html(StringIO(response.text))

    def _daily_price_url(self, code):
        """Return the Naver Finance daily price URL for a stock code."""
        return 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=code.strip())

    def _fetch_daily_prices(self, url, page_limit, last_day=""):
        """Crawl pages 1..page_limit-1 of ``url`` into one cleaned DataFrame.

        Crawling stops early on the page that contains ``last_day``
        (formatted 'YYYY.MM.DD', as in the table), or on the second page
        that is not full. The site repeats its last page for recently listed
        companies, so a second short page means there is nothing new.

        Returns a DataFrame with columns date, close, diff, open, high, low,
        volume in the order the pages list them.
        """
        frames = []
        short_page_seen = False
        for page in range(1, page_limit):
            table = self._read_tables('{url}&page={page}'.format(url=url, page=page))[0]
            dates = [d for d in table['날짜'].values if isinstance(d, str)]
            if last_day and last_day in dates:
                # Everything older is already stored; take this page and stop.
                frames.append(table)
                break
            if len(dates) == _ROWS_PER_PAGE:
                frames.append(table)
            elif not short_page_seen:
                frames.append(table)
                short_page_seen = True
            else:
                break
        if not frames:
            return pd.DataFrame(columns=list(_PRICE_COLUMNS.values()))
        # Drop rows with missing values (the table contains blank rows).
        df = pd.concat(frames, ignore_index=True).dropna()
        df = df.rename(columns=_PRICE_COLUMNS)
        df[_NUMERIC_COLUMNS] = df[_NUMERIC_COLUMNS].astype(int)
        df['date'] = pd.to_datetime(df['date'])
        return df

    def _row_to_record(self, columns, values):
        """Convert one row of ``df.values`` into a PRICE record dict."""
        record = {"DATE": str(values[0]).split(' ')[0]}
        for position in range(1, 7):
            record[columns[position]] = values[position]
        return record

    def _load_stocks(self, path):
        """Return the stocks stored in the JSON Lines file ``path``.

        Returns an empty list when the file does not exist.
        """
        if not os.path.isfile(path):
            return []
        with open(path, 'r', encoding='utf-8') as in_fp:
            return [json.loads(line) for line in map(str.strip, in_fp) if line]

    def _write_stock(self, out_fp, stock):
        """Write ``stock`` to ``out_fp`` as one JSON line."""
        out_fp.write(json.dumps(stock, ensure_ascii=False) + "\n")

    def getStockInfo(self):
        """Download the KRX listed-company table.

        Returns a DataFrame with columns 'name' and 'code'; codes are
        zero-padded to six digits.
        """
        code_df = self._read_tables('http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13')[0]
        # Stock codes are six digits long, so zero-pad them.
        code_df['종목코드'] = code_df['종목코드'].map('{:06d}'.format)
        # Only the company name and stock code are needed.
        code_df = code_df[['회사명', '종목코드']]
        # Rename the Korean column names to English.
        code_df = code_df.rename(columns={'회사명': 'name', '종목코드': 'code'})
        return code_df

    def get_url(self, item_name, code_df):
        """Look up the code for ``item_name`` and build its price URL.

        Returns a ``(code, url)`` tuple.
        """
        # A boolean mask (rather than a formatted query string) keeps names
        # containing quote characters from breaking the lookup.
        matches = code_df.loc[code_df['name'] == item_name, 'code']
        code = matches.to_string(index=False).strip()
        return code, self._daily_price_url(code)

    def date_format(self, d):
        """Return ``d`` as a string with '-' replaced by '.'."""
        return str(d).replace('-', '.')

    def getCodeIndex(self, stocks, item_code):
        """Return the index of the first stock whose CODE equals
        ``item_code``, or -1 if there is none."""
        for i, stock in enumerate(stocks):
            if item_code == stock['CODE']:
                return i
        return -1

    def crawl_stocks(self, inFileName):
        """Extend every stock's history in ``inFileName`` with new prices.

        The file is rewritten in place: the ETFs in _CRAWL_ETFS first, then
        every company returned by getStockInfo. Existing histories are
        matched by CODE, so the order of lines in the file does not matter.
        """
        known = {}
        for saved in self._load_stocks(inFileName):
            # Keep the first entry per code, matching getCodeIndex.
            known.setdefault(saved['CODE'], saved)

        def seed(name, code):
            saved = known.get(code)
            prices = saved["PRICE"] if saved is not None else []
            return {"NAME": name, "CODE": code, "PRICE": prices}

        with open(inFileName, "w", encoding="utf-8") as outFp:
            for name, code in _CRAWL_ETFS:
                self._write_stock(
                    outFp, self.crawl_specific_stock(name, code, seed(name, code)))

            code_df = self.getStockInfo()
            for idx, item in enumerate(code_df.values, start=1):
                item_name = item[0]
                item_code = item[1]
                print(idx, item_name, item_code)
                stock = self.crawl_specific_stock(
                    item_name, item_code, seed(item_name, item_code))
                self._write_stock(outFp, stock)

    def get_stocks_avg(self, inFileName, outFileName):
        """Copy ``inFileName`` to ``outFileName`` adding 'avg<N>' fields.

        Every PRICE item gets the rounded mean of the 'close' values of
        itself and the preceding items, for each window in _AVG_WINDOWS.
        """
        # Read everything before opening the output so that passing the same
        # path for both arguments does not truncate the input first.
        with open(inFileName, 'r', encoding='utf-8') as inFp:
            lines = inFp.readlines()
        with open(outFileName, 'w', encoding='utf-8') as outFp:
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                jsonData = json.loads(line)
                averages = _MovingAverages()
                for item in jsonData["PRICE"]:
                    averages.push(item['close'])
                    item.update(averages.snapshot())
                self._write_stock(outFp, jsonData)

    def crawl_specific_stock(self, code_name, code, stock):
        """Append prices newer than the stored history to ``stock``.

        Args:
            code_name: display name of the stock (not used for crawling).
            code: stock code used to build the price URL.
            stock: dict with a "PRICE" list of records; updated in place.

        Returns:
            ``stock`` with "PRICE" sorted by DATE ascending.
        """
        prices = stock.setdefault("PRICE", [])
        # Use the newest stored date, not the final list element: callers
        # such as update_stocks pass histories sorted newest-first.
        last_day = max((record["DATE"] for record in prices), default="")
        df = self._fetch_daily_prices(
            self._daily_price_url(code), self.limit_page_count,
            last_day.replace("-", "."))

        # Rows arrive newest-first; stop at the first day already stored.
        for values in df.values:
            record = self._row_to_record(df.columns, values)
            if record["DATE"] == last_day:
                break
            prices.append(record)

        stock["PRICE"] = sorted(prices, key=lambda x: x['DATE'])
        return stock

    def update_stocks(self, inFileName):
        """Refresh ``inFileName`` with the latest prices and averages.

        With no regular stocks in the file, every company from getStockInfo
        is crawled from scratch (up to 999 pages each). Otherwise only the
        first page is fetched per stock and merged into its history. The
        ETFs in _UPDATE_ETFS are written last via crawl_specific_stock.

        NOTE(review): averages here are computed over records in
        newest-first order, unlike get_stocks_avg - confirm this is intended.
        """
        etf_seeds = {code: {"NAME": name, "CODE": code, "PRICE": []}
                     for name, code in _UPDATE_ETFS}
        stocks = []
        for saved in self._load_stocks(inFileName):
            saved["PRICE"] = sorted(saved["PRICE"], key=lambda x: x['DATE'], reverse=True)
            if saved['CODE'] in etf_seeds:
                etf_seeds[saved['CODE']] = saved
            else:
                stocks.append(saved)

        with open(inFileName, 'w', encoding='utf-8') as outFp:
            if not stocks:
                page_limit = 1000
                entries = [(row[0], row[1], None)
                           for row in self.getStockInfo().values]
            else:
                page_limit = 2
                entries = [(saved['NAME'], saved['CODE'], saved)
                           for saved in stocks]

            for idx, (item_name, item_code, saved) in enumerate(entries, start=1):
                print(idx, item_name)
                stock = {"NAME": item_name, "CODE": item_code, "PRICE": []}
                df = self._fetch_daily_prices(
                    self._daily_price_url(item_code), page_limit)
                averages = _MovingAverages()

                if saved is None:
                    # Full crawl: every downloaded row becomes a record.
                    for values in df.values:
                        averages.push(values[1])
                        record = self._row_to_record(df.columns, values)
                        record.update(averages.snapshot())
                        stock["PRICE"].append(record)
                else:
                    by_date = {}
                    # Records read from the existing file.
                    for old in saved["PRICE"]:
                        averages.push(old["close"])
                        record = {"DATE": str(old["DATE"]).split(' ')[0]}
                        for column in _NUMERIC_COLUMNS:
                            record[column] = old[column]
                        record.update(averages.snapshot())
                        stock["PRICE"].append(record)
                        by_date.setdefault(record["DATE"], record)
                    # Rows newly collected from the web.
                    for values in df.values:
                        fresh = self._row_to_record(df.columns, values)
                        existing = by_date.get(fresh["DATE"])
                        if existing is not None:
                            # Already stored: refresh prices, keep averages.
                            existing.update(fresh)
                            continue
                        # A new day (or today's data).
                        averages.push(values[1])
                        fresh.update(averages.snapshot())
                        stock["PRICE"].append(fresh)
                        by_date[fresh["DATE"]] = fresh

                stock["PRICE"] = sorted(stock["PRICE"], key=lambda x: x['DATE'], reverse=True)
                self._write_stock(outFp, stock)

            for name, code in _UPDATE_ETFS:
                self._write_stock(
                    outFp, self.crawl_specific_stock(name, code, etf_seeds[code]))

    def update_specific_stock(self, code_name, code, stock):
        """Crawl prices for one stock and append them with averages.

        Fetches up to 999 pages when ``stock["PRICE"]`` is empty, otherwise
        only the first page.

        NOTE(review): downloaded rows are appended without checking whether
        their date is already in ``stock["PRICE"]`` - confirm that callers
        expect this.

        Returns:
            ``stock`` with "PRICE" sorted by DATE descending.
        """
        print(code_name)
        page_limit = 2 if stock["PRICE"] else 1000
        df = self._fetch_daily_prices(self._daily_price_url(code), page_limit)

        averages = _MovingAverages()
        for values in df.values:
            averages.push(values[1])
            record = self._row_to_record(df.columns, values)
            record.update(averages.snapshot())
            stock["PRICE"].append(record)

        stock["PRICE"] = sorted(stock["PRICE"], key=lambda x: x['DATE'], reverse=True)
        return stock