Files
DeepStock/stockpredictor/crawler/toSQLite/StockCrawler.py
dsyoon 9e6f44b5ab init
2021-02-23 06:14:49 +09:00

410 lines
13 KiB
Python

# https://bigdata-sk.tistory.com/10
import pandas as pd
import re
import json
import sqlite3
import requests
class Queue(object):
    """Bounded FIFO window of numbers used for moving sums and averages.

    The window keeps at most ``max`` items; enqueueing into a full window
    evicts the oldest item first.
    """

    def __init__(self, max):
        """Create an empty window.

        :param max: capacity of the window (number of items retained).
        """
        self.queue = []
        self.max = max

    def dequeue(self):
        """Remove and return the oldest item, but only when the window is full.

        :return: the oldest item, or -1 when the window is empty or holds
            fewer than ``max`` items.
        """
        if not self.queue or len(self.queue) < self.max:
            return -1
        return self.queue.pop(0)

    def enqueue(self, n):
        """Append ``n``, evicting the oldest item first if the window is full."""
        if len(self.queue) == self.max:
            self.dequeue()
        self.queue.append(n)

    def sum(self):
        """Return the sum of the items currently in the window (0 if empty)."""
        # Inside a method body ``sum`` resolves to the builtin, not this method.
        return sum(self.queue)

    def avg(self):
        """Return the rounded mean of the window.

        :return: ``round(sum / count)``, or 0 for an empty window (the
            original raised ZeroDivisionError in that case).
        """
        if not self.queue:
            return 0
        return round(self.sum() / len(self.queue))

    def print(self):
        """Print the current sum followed by the raw window contents."""
        print(self.sum(), self.queue)
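# The this_date value, once converted to date format, can be inspected here.
# Dates that are read in need to be converted to the date format again and again, so that logic is defined as a function.
# The function is named date_format().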
class StockCrawler:
# Browser-like User-Agent header; passed as headers= to the requests.get calls in getStockInfo and get_data.
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Replaced with an empty dict per instance in __init__.
historical_prices = None
# Replaced with a tuple of punctuation and digit characters per instance in __init__.
special_pattern = None
# NOTE(review): not referenced anywhere in the visible part of this file -- confirm whether it is still needed.
fnGuideCrawler = None
# Exclusive upper bound of the page numbers requested by get_data (pages 1 .. limit_page_count - 1).
limit_page_count = 10000
def __init__(self):
self.historical_prices = dict()
self.special_pattern = (
'[', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', ',', '.', '?', '"', ':', ';', '{', '}', '|', '<', '>',
']', '+', '-', '/', '=', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
return
def clean_str(self, string):
    """Normalise a free-text string for matching.

    Backslashes, single/double quotes, backticks and hyphens become spaces,
    parenthesised fragments are removed, runs of spaces are collapsed to one,
    and the result is stripped and lower-cased.

    :param string: text to clean.
    :return: the cleaned, lower-case string.
    """
    # Each of these punctuation characters is replaced by a single space.
    string = re.sub(r"[\\'\"`-]", " ", string)
    # Drop "(...)" fragments (non-greedy, so each pair is removed separately).
    string = re.sub(r"\(.*?\)", " ", string)
    # The original replaced " " with " " (a no-op); collapse the runs of
    # spaces that the substitutions above leave behind.
    string = re.sub(r" {2,}", " ", string)
    return string.strip().lower()
def getStockInfo(self):
    """Download the KRX listed-company table and return its name/code columns.

    :return: DataFrame with two columns, ``name`` and ``code``; codes are
        zero-padded six-character strings.
    """
    listing_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13'
    response = requests.get(listing_url, headers=self.header)
    code_df = pd.read_html(response.text)[0]
    # Stock codes are six digits wide, so pad them with leading zeros.
    code_df.종목코드 = code_df.종목코드.map('{:06d}'.format)
    # Only the company name and the stock code are needed; give them English column names.
    return code_df[['회사명', '종목코드']].rename(columns={'회사명': 'name', '종목코드': 'code'})
# Given a stock name, look up its code and build the
# Naver Finance (http://finance.naver.com) daily-price URL for it.
def get_url(self, item_name, code_df):
    """Return ``(code, url)`` for the stock called ``item_name``.

    :param item_name: company name to look up in the ``name`` column.
    :param code_df: DataFrame with ``name`` and ``code`` columns.
    :return: tuple of the code rendered as text and the daily-price URL.
        As before, the code text is whatever ``Series.to_string`` produces,
        so it is only a clean code when exactly one row matches.
    """
    # A boolean mask instead of a formatted query() expression: names that
    # contain quote characters no longer break the lookup.
    matches = code_df.loc[code_df['name'] == item_name, 'code']
    code = matches.to_string(index=False).strip()
    url = 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=code)
    return code, url
def date_format(self, d):
    """Return ``d`` as text with every '-' replaced by '.'.

    :param d: a date-like value; it is converted with ``str()`` first.
    :return: the dotted string, e.g. ``"2021-02-23"`` -> ``"2021.02.23"``.
    """
    return str(d).replace('-', '.')
def getCodeIndex(self, stocks, item_code):
    """Return the position of the first entry whose 'CODE' equals ``item_code``, or -1 if none does."""
    positions = (pos for pos, entry in enumerate(stocks) if item_code == entry['CODE'])
    return next(positions, -1)
def crawl_etf_stocks(self, inFileName):
    """Crawl a fixed list of KODEX ETFs and store their price history in SQLite.

    For each ETF the stored PRICE JSON (if any) is loaded, brought up to date
    with ``crawl_specific_stock``, and written back to the ``stock`` table.

    :param inFileName: path of the SQLite database file to read and update.
    """
    tableName = 'stock'
    etfs = (
        ('KODEX 코스닥150선물인버스', "251340"),
        ('KODEX 코스닥150 레버리지', "233740"),
        ('KODEX 200선물인버스2X', "252670"),
        ('KODEX 레버리지', "122630"),
        ('KODEX 골드선물(H)', "132030"),
    )
    conn = sqlite3.connect(inFileName)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text PRIMARY KEY, NAME text, PRICE text)")
        for name, code in etfs:
            stock = {"NAME": name, "CODE": code, "PRICE": []}
            cursor.execute('SELECT PRICE FROM ' + tableName + ' WHERE CODE=?', (code,))
            row = cursor.fetchone()
            if row is not None:
                stock["PRICE"] = json.loads(row[0])
            self.crawl_specific_stock(stock)
            text = json.dumps(stock['PRICE'], ensure_ascii=False)
            # Nothing writes to the table between the SELECT above and here,
            # so its result already tells us whether to insert or update.
            if row is None:
                cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, PRICE) VALUES(?, ?, ?)", (code, name, text))
            else:
                cursor.execute("UPDATE " + tableName + " SET PRICE=? WHERE CODE=?", (text, code))
            # Commit per ETF so completed work survives a failure on a later one.
            conn.commit()
        cursor.close()
    finally:
        # Always release the database handle, even when crawling raises.
        conn.close()
def crawl_stocks(self, inFileName):
    """Crawl every company returned by ``getStockInfo`` and store its prices in SQLite.

    For each (name, code) row the stored PRICE JSON (if any) is loaded,
    brought up to date with ``crawl_specific_stock``, and written back to the
    ``stock`` table. Progress is printed as ``index name code``.

    :param inFileName: path of the SQLite database file to read and update.
    """
    tableName = 'stock'
    conn = sqlite3.connect(inFileName)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text PRIMARY KEY, NAME text, PRICE text)")
        code_df = self.getStockInfo()
        for idx, item in enumerate(code_df.values, start=1):
            item_name = item[0]
            item_code = item[1]
            print(idx, item_name, item_code)
            stock = {"CODE": item_code, "NAME": item_name, "PRICE": []}
            cursor.execute('SELECT PRICE FROM ' + tableName + ' WHERE CODE=?', (item_code,))
            row = cursor.fetchone()
            if row is not None:
                stock["PRICE"] = json.loads(row[0])
            self.crawl_specific_stock(stock)
            text = json.dumps(stock['PRICE'], ensure_ascii=False)
            # Nothing writes to the table between the SELECT above and here,
            # so its result already tells us whether to insert or update.
            if row is None:
                cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, PRICE) VALUES(?, ?, ?)", (stock["CODE"], stock["NAME"], text))
            else:
                cursor.execute("UPDATE " + tableName + " SET PRICE=? WHERE CODE=?", (text, stock["CODE"]))
            # Commit per stock so completed work survives a failure on a later one.
            conn.commit()
        cursor.close()
    finally:
        # Always release the database handle, even when crawling raises.
        conn.close()
def get_data(self, stock):
    """Download daily prices from Naver Finance and merge new days into ``stock["PRICE"]``.

    Pages are requested from page 1 upwards and paging stops as soon as one of
    these holds: the page contains the most recent stored date, the page has
    fewer than ten dated rows, or the page repeats the previous one. Rows that
    precede the stored last date in the download order are appended, and
    ``stock["PRICE"]`` is re-sorted by ``DATE`` in ascending order.

    :param stock: dict with ``CODE`` (str) and ``PRICE`` (list of dicts, each
        carrying a ``DATE`` key formatted ``YYYY-MM-DD``); updated in place.
    """
    import io  # local import: used only to hand the HTML text to pandas as a file-like object

    url = 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=stock['CODE'].strip())
    rows_per_page = 10

    prices = stock["PRICE"]
    # PRICE is kept sorted ascending by DATE, so the last entry is the newest
    # stored day. The original required at least two entries here, which made
    # a one-entry history re-download everything and duplicate that day.
    last_day = prices[-1]["DATE"] if prices else ""
    # The page's date column is compared in dotted form.
    last_day_on_page = last_day.replace("-", ".")

    pages = []
    previous_dates = None
    for page in range(1, self.limit_page_count):
        pg_url = '{url}&page={page}'.format(url=url, page=page)
        html = requests.get(pg_url, headers=self.header).text
        table = pd.read_html(io.StringIO(html))[0]
        # Only string cells are counted as dated rows.
        dates = [d for d in table['날짜'].values if isinstance(d, str)]
        if dates == previous_dates:
            # Same dates as the previous page: treat it as a repeat and stop.
            break
        pages.append(table)
        previous_dates = dates
        if last_day_on_page in dates or len(dates) < rows_per_page:
            # Reached already-stored data, or a short (final) page.
            break

    if not pages:
        return

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    df = pd.concat(pages, ignore_index=True)
    # Remove rows that contain missing values.
    df = df.dropna()
    # Rename the Korean column names to English.
    df = df.rename(columns={'날짜': 'date', '종가': 'close', '전일비': 'diff', '시가': 'open', '고가': 'high', '저가': 'low', '거래량': 'volume'})
    int_columns = ['close', 'diff', 'open', 'high', 'low', 'volume']
    df[int_columns] = df[int_columns].astype(int)
    df['date'] = pd.to_datetime(df['date'])

    for values in df.values:
        # str(Timestamp) looks like "YYYY-MM-DD HH:MM:SS"; keep the date part.
        day = str(values[0]).split(' ')[0]
        if last_day == day:
            # Everything from here on in the download order is not added.
            break
        prices.append({
            "DATE": day,
            df.columns[1]: values[1],
            df.columns[2]: values[2],
            df.columns[3]: values[3],
            df.columns[4]: values[4],
            df.columns[5]: values[5],
            df.columns[6]: values[6],
        })
    stock["PRICE"] = sorted(prices, key=lambda x: x['DATE'])
    return
def get_moving_avg(self, stock):
    """Annotate every price entry with moving averages of its ``close`` value.

    One ``Queue`` window is kept per period; after each entry's close is fed
    to a window, that window's average is stored on the entry as ``avg<N>``.

    :param stock: dict whose ``PRICE`` list holds dicts with a ``close`` key;
        the entries are updated in place.
    """
    periods = (3, 5, 7, 10, 20, 30, 60, 90, 100, 120, 150, 180, 200, 240)
    windows = [(period, Queue(period)) for period in periods]
    for entry in stock['PRICE']:
        close = entry['close']
        for period, window in windows:
            window.enqueue(close)
            entry['avg' + str(period)] = window.avg()
    return
def crawl_specific_stock(self, stock):
    """Refresh one stock in place: collect its price data, then compute its moving averages."""
    # Order matters: the averages are computed over the freshly collected prices.
    for step in (self.get_data, self.get_moving_avg):
        step(stock)
def update(self, inFileName, outFileName):
    """Import a full JSON-lines dump into the SQLite database.

    Each non-blank line of the input is one JSON object with ``CODE``,
    ``NAME`` and ``PRICE`` keys. A row is inserted for an unknown code;
    for a known code only its PRICE column is overwritten.

    Example::

        inFileName = PROJECT_HOME + '/resources/stock.json.full'
        outFileName = PROJECT_HOME + '/resources/stock.db'
        crawler = StockCrawler()
        crawler.update(inFileName, outFileName)

    :param inFileName: path of the UTF-8 JSON-lines file to read.
    :param outFileName: path of the SQLite database file to write.
    :return: None
    """
    tableName = 'stock'
    # isolation_level=None puts the connection in autocommit mode, so each
    # statement is persisted without an explicit commit().
    conn = sqlite3.connect(outFileName, isolation_level=None)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text PRIMARY KEY, NAME text, PRICE text)")
        idx = 0
        with open(inFileName, 'r', encoding='utf-8') as inFp:
            for line in inFp:
                # Lines keep their newline, so the original ``if line:`` was
                # always true and a blank line crashed json.loads.
                if not line.strip():
                    continue
                idx += 1
                stock = json.loads(line)
                print(idx, stock["CODE"], stock["NAME"])
                text = json.dumps(stock["PRICE"], ensure_ascii=False)
                cursor.execute('SELECT 1 FROM ' + tableName + ' WHERE CODE=?', (stock["CODE"],))
                if cursor.fetchone() is None:
                    cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, PRICE) VALUES(?, ?, ?)", (stock["CODE"], stock["NAME"], text))
                else:
                    cursor.execute("UPDATE " + tableName + " SET PRICE=? WHERE CODE=?", (text, stock["CODE"]))
        cursor.close()
    finally:
        # Release the database handle even when a line fails to parse.
        conn.close()
    return
def saveIndex(self, code, inFileName, outFileName):
    """Import index prices from a tab-separated text file into the ``stock`` table.

    Lines starting with ``#`` and blank lines are skipped. The columns read
    are: 0 = date, 1 = close, 2 = open, 3 = high, 4 = low, 6 = change with a
    trailing ``%``. Reading stops at the first line whose date equals the
    newest date already stored for ``code``. Moving averages are recomputed
    with ``get_moving_avg`` and the row is inserted or updated, with the
    MACD, STOCHASTIC, ICHIMOKU and RSI columns reset to ``"[{}]"``.

    :param code: index identifier; stored as both CODE and NAME.
    :param inFileName: path of the UTF-8 tab-separated input file.
    :param outFileName: path of the SQLite database file to write.
    """
    tableName = 'stock'
    indicator_columns = ("MACD", "STOCHASTIC", "ICHIMOKU", "RSI")
    avg_periods = (3, 5, 7, 10, 20, 30, 60, 90, 100, 120, 150, 180, 200, 240)
    conn = sqlite3.connect(outFileName)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text PRIMARY KEY, NAME text, PRICE text)")
        # The statements below write indicator columns that the CREATE TABLE
        # above does not define; add whichever ones are missing so the
        # INSERT/UPDATE cannot fail with "no such column".
        cursor.execute("PRAGMA table_info(" + tableName + ")")
        present = {info[1].upper() for info in cursor.fetchall()}
        for column in indicator_columns:
            if column not in present:
                cursor.execute("ALTER TABLE " + tableName + " ADD COLUMN " + column + " text")

        stock = {"NAME": code, "CODE": code, "PRICE": []}
        lastDay = ""
        cursor.execute('SELECT PRICE FROM ' + tableName + ' WHERE CODE=?', (stock["CODE"],))
        row = cursor.fetchone()
        if row is not None:
            stock["PRICE"] = json.loads(row[0])
            # An empty stored history has no last day to stop at.
            if stock["PRICE"]:
                lastDay = stock["PRICE"][-1]["DATE"]

        with open(inFileName, "r", encoding="utf-8") as inFp:
            for line in inFp:
                line = line.strip()
                # Skip blank lines (the original raised IndexError on them) and comments.
                if not line or line[0] == "#":
                    continue
                arr = line.split("\t")
                if arr[0] == lastDay:
                    break
                price = {"DATE": arr[0], "close": float(arr[1]), "diff": float(arr[6].replace("%", "")), "open": float(arr[2]), "high": float(arr[3]), "low": float(arr[4]), "volume": 0}
                # Placeholders; get_moving_avg overwrites them below.
                for period in avg_periods:
                    price['avg' + str(period)] = 0
                stock["PRICE"].append(price)

        stock["PRICE"] = sorted(stock["PRICE"], key=lambda x: x['DATE'])
        self.get_moving_avg(stock)
        text = json.dumps(stock['PRICE'], ensure_ascii=False)
        # Nothing wrote to this row since the SELECT above, so reuse its result.
        if row is None:
            cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, PRICE, MACD, STOCHASTIC, ICHIMOKU, RSI) VALUES(?, ?, ?, ?, ?, ?, ?)", (stock["CODE"], stock["NAME"], text, "[{}]", "[{}]", "[{}]", "[{}]"))
        else:
            cursor.execute("UPDATE " + tableName + " SET PRICE=?, MACD=?, STOCHASTIC=?, ICHIMOKU=?, RSI=? WHERE CODE=?", (text, "[{}]", "[{}]", "[{}]", "[{}]", stock["CODE"]))
        conn.commit()
        cursor.close()
    finally:
        # Always release the database handle, even when parsing raises.
        conn.close()
    return