Files
DeepStock/stock/crawler/StockCrawler.py
dsyoon 682138a8ae init
2023-09-03 10:29:03 +09:00

526 lines
18 KiB
Python

# https://bigdata-sk.tistory.com/10
from datetime import datetime, timedelta
import os
import pandas as pd
import re
import json
import sqlite3
import requests
from time import sleep
import time
from pandas_datareader import data as pdr
import yfinance as yfin
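# The this_date value, once converted to the date format, can be inspected here.
# Date values that are read in have to be converted to the date format repeatedly, so that step is defined as a function.
# The function is named date_format().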
# Crawler whose methods download stock price data and store it in a SQLite
# table named `stock`.
class StockCrawler:
# HTTP request headers (a desktop-browser User-Agent) sent by getStockInfo().
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Replaced with an empty dict per instance in __init__.
historical_prices = None
# Replaced in __init__ with a tuple of punctuation and digit characters.
special_pattern = None
# Not assigned or read anywhere in the visible part of this file.
fnGuideCrawler = None
# Not read anywhere in the visible part of this file; presumably an upper
# bound for a paginated crawl -- TODO confirm.
limit_page_count = 1000000
# Start date; overwritten per instance by __init__.
START_DATE = None
def __init__(self, START_DATE):
self.historical_prices = dict()
self.special_pattern = (
'[', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', ',', '.', '?', '"', ':', ';', '{', '}', '|', '<', '>',
']', '+', '-', '/', '=', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
self.START_DATE = START_DATE
"""
start_day = (datetime.today() - timedelta(weeks=2)).strftime('%Y-%m-%d')
end_day = datetime.today().strftime('%Y-%m-%d')
yfin.pdr_override()
data = pdr.get_data_yahoo("311690.KQ", start_day, end_day, auto_adjust=True, progress=False)
print (data)
"""
return
def clean_str(self, string):
    """Normalise a piece of free text.

    Every backslash, single quote, double quote, backtick and hyphen is
    turned into a space, each parenthesised span ``(...)`` (shortest match,
    not crossing a newline) is turned into one space, and the result is
    stripped of surrounding whitespace and lower-cased.

    Args:
        string: Text to normalise.

    Returns:
        The normalised text.
    """
    without_marks = re.sub(r"[\\'\"`-]", " ", string)
    without_parens = re.sub(r"\(.*?\)", " ", without_marks)
    return without_parens.strip().lower()
def getStockInfo(self):
    """Download the KRX listed-company table and return names with codes.

    The page is requested with the class-level ``header`` and a 30 second
    timeout, and its first HTML table is parsed with pandas.

    Returns:
        A DataFrame with two columns: ``name`` (company name) and ``code``
        (stock code formatted as a zero-padded six-character string).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Local import: read_html expects literal HTML to be wrapped in a
    # file-like object (passing a raw string is deprecated since pandas 2.1).
    from io import StringIO

    response = requests.get(
        'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13',
        headers=self.header,
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    code_df = pd.read_html(StringIO(response.text))[0]

    # Only the company name and the stock code are needed; give them
    # English column names.
    code_df = code_df[['회사명', '종목코드']].rename(columns={'회사명': 'name', '종목코드': 'code'})
    # Stock codes are six digits long, so restore the leading zeros.
    code_df['code'] = code_df['code'].map('{:06d}'.format)
    return code_df
def get_url(self, item_name, code_df):
    """Look up the code of a stock by name and build its Naver Finance URL.

    Args:
        item_name: Company name to search for in the ``name`` column.
        code_df: DataFrame with ``name`` and ``code`` columns (the shape
            returned by ``getStockInfo``).

    Returns:
        A ``(code, url)`` tuple. ``code`` is the text rendering of the
        matching ``code`` cells; ``url`` is the daily-price page on
        finance.naver.com for that code.
    """
    # Select with a boolean mask rather than a formatted query string, so
    # names that contain quote characters cannot break the expression.
    matches = code_df.loc[code_df['name'] == item_name, 'code']
    code = matches.to_string(index=False).strip()
    url = 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=code)
    return code, url
def date_format(self, d):
    """Return ``d`` as text with every hyphen replaced by a dot.

    Args:
        d: Any value; it is converted with ``str()`` first, so
            ``"2023-09-03"`` becomes ``"2023.09.03"``.

    Returns:
        The dotted string.
    """
    return str(d).replace('-', '.')
def getCodeIndex(self, stocks, item_code):
    """Find the position of a stock code in a list of stock dicts.

    Args:
        stocks: Sequence of mappings that each have a ``'CODE'`` key.
        item_code: Code to look for.

    Returns:
        Index of the first entry whose ``'CODE'`` equals ``item_code``,
        or -1 when there is none.
    """
    positions = (
        position
        for position, entry in enumerate(stocks)
        if item_code == entry['CODE']
    )
    return next(positions, -1)
def crawl_etf_stocks(self, inFileName, delay=0.5):
    """Collect daily prices for a fixed list of ETFs into the ``stock`` table.

    For every ETF the most recent stored ``ymd`` (or ``START_DATE`` when
    nothing is stored yet) is passed to ``crawl_specific_stock``; each
    returned row is inserted unless a row with the same ``CODE`` and
    ``ymd`` already exists. Work is committed after each ETF.

    Args:
        inFileName: Path of the SQLite database file.
        delay: Seconds to pause after each ETF.
    """
    tableName = 'stock'
    etfs = (
        ('KODEX 코스닥150선물인버스', '251340'),
        ('KODEX 코스닥150 레버리지', '233740'),
        ('KODEX 200선물인버스2X', '252670'),
        ('KODEX 레버리지', '122630'),
        ('KODEX 인버스', '114800'),
        ('KODEX 중국본토CSI300', '283580'),
        ('KODEX 심천ChiNext(합성)', '256750'),
        ('KINDEX 블룸버그베트남VN30선물레버리지(H)', '371130'),
        ('KODEX 미국S&P바이오(합성)', '185680'),
        ('KODEX 미국S&P에너지(합성)', '218420'),
        ('KODEX 골드선물(H)', '132030'),
        ('KODEX 콩선물(H)', '138920'),
        ('KODEX 3대농산물선물(H)', '271060'),
        ('KODEX 건설', '117700'),
        ('KODEX 헬스케어', '266420'),
        ('KODEX 글로벌4차산업로보틱스(합성)', '276990'),
        ('KODEX 바이오', '244580'),
        ('KODEX 반도체', '091160'),
        ('KODEX 보험', '140700'),
        ('KODEX 필수소비재', '266410'),
        ('KODEX 2차전지산업', '305720'),
        ('KODEX 경기소비재', '266390'),
        ('KODEX 철강', '117680'),
        ('KODEX 에너지화학', '117460'),
        ('KODEX 은행', '091170'),
        ('TIGER 탄소효율그린뉴딜', '376410'),
    )

    conn = sqlite3.connect(inFileName)
    try:
        cursor = conn.cursor()
        # Create the table and its lookup index on first use.
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text, NAME text, ymd text, close REAL, diff REAL, open REAL, high REAL, low REAL, volume REAL)")
        cursor.execute("CREATE INDEX IF NOT EXISTS " + tableName + "_idx on " + tableName + " (CODE, ymd) ")

        start_time = time.time()
        for i, (name, code) in enumerate(etfs):
            print(i, name, code, (time.time() - start_time), "s")
            start_time = time.time()

            cursor.execute('SELECT ymd FROM ' + tableName + ' WHERE CODE=? order by ymd desc', (code,))
            latest = cursor.fetchone()
            ymd = self.START_DATE if latest is None else latest[0]

            # The fetch may yield None when no data came back; treat that
            # the same as an empty result instead of failing on iteration.
            stock_data = self.crawl_specific_stock(code, ymd) or []
            for item in stock_data:
                cursor.execute('SELECT * FROM ' + tableName + ' WHERE CODE=? and ymd=?', (code, item['ymd'],))
                if cursor.fetchone() is None:
                    cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, ymd, close, diff, open, high, low, volume) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)", (code, name, item['ymd'], item['close'], item['diff'], item['open'], item['high'], item['low'], item['volume']))
            conn.commit()
            sleep(delay)
        cursor.close()
    finally:
        # Release the database handle even when a fetch or insert fails.
        conn.close()
def crawl_stocks(self, inFileName, delay=0.3):
    """Collect daily prices for every company returned by ``getStockInfo``.

    For each company the most recent stored ``ymd`` (or ``START_DATE`` with
    dots replaced by hyphens when nothing is stored) is passed to
    ``crawl_specific_stock``; each returned row is inserted into the
    ``stock`` table unless a row with the same ``CODE`` and ``ymd`` exists.
    A fresh connection is opened, committed and closed per company.

    Args:
        inFileName: Path of the SQLite database file.
        delay: Seconds to pause after each company.
    """
    tableName = 'stock'

    # Create the table and its lookup index on first use.
    conn = sqlite3.connect(inFileName)
    try:
        conn.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text, NAME text, ymd text, close REAL, diff REAL, open REAL, high REAL, low REAL, volume REAL)")
        conn.execute("CREATE INDEX IF NOT EXISTS " + tableName + "_idx on " + tableName + " (CODE, ymd) ")
        conn.commit()
    finally:
        conn.close()

    listing = self.getStockInfo().values
    default_start = self.START_DATE.replace(".", "-")
    start_time = time.time()
    for idx, listed in enumerate(listing, start=1):
        item_name, item_code = listed[0], listed[1]

        conn = sqlite3.connect(inFileName)
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT ymd FROM ' + tableName + ' WHERE CODE=? order by ymd desc', (item_code,))
            latest = cursor.fetchone()
            ymd = default_start if latest is None else latest[0]

            # A None result means nothing was fetched for this code.
            stock_data = self.crawl_specific_stock(item_code, ymd) or []
            for row in stock_data:
                cursor.execute('SELECT * FROM ' + tableName + ' WHERE CODE=? and ymd=?', (item_code, row['ymd'],))
                if cursor.fetchone() is None:
                    cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, ymd, close, diff, open, high, low, volume) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)", (item_code, item_name, row['ymd'], row['close'], row['diff'], row['open'], row['high'], row['low'], row['volume']))
            conn.commit()
            cursor.close()
        finally:
            # Release the database handle even when a fetch or insert fails.
            conn.close()

        print(idx, item_name, item_code, (time.time() - start_time), "s")
        start_time = time.time()
        sleep(delay)
def crawl_special_stocks(self, inFileName, delay=0.05):
    """Download index and overseas ticker prices and store the missing rows.

    Steps:
      1. Make sure the ``stock`` table exists and read, per ticker, the most
         recent stored ``ymd``.
      2. Download each ticker from that date (or ``START_DATE``) up to today
         through ``pdr.get_data_yahoo``; tickers that return no rows are
         skipped.
      3. Walk the dates of the ``^KS11`` download in order. For every date a
         ticker has a close for, build a row whose ``diff`` is the change
         from the previous date in that list (a date is skipped when the
         ticker lacks the previous date; the very first date uses the close
         itself as ``diff``).
      4. Insert every row whose ``(CODE, ymd)`` is not stored yet, committing
         once per ticker.

    Args:
        inFileName: Path of the SQLite database file.
        delay: Seconds to pause after each ticker has been stored.
    """
    tableName = 'stock'
    # Ticker whose dates drive the row-building loop for all tickers.
    calendar_ticker = '^KS11'
    special_stocks = {
        '^KS11': 'Kospi',
        '^KQ11': 'Kosdak',
        '^DJI': 'Dow Johns',
        '^GSPC': 'S&P 500',
        '^IXIC': 'Nasdaq',
        '^NDX': 'NASDAQ 100',
        'SQQQ': 'ProShares UltraPro Short QQQ',
        'TQQQ': 'ProShares UltraPro QQQ',
        'SCO': 'ProShares UltraShort Bloomberg Crude Oil',
        'UCO': 'ProShares Ultra Bloomberg Crude Oil',
        'GLL': 'ProShares UltraShort Gold',
        'UGL': 'ProShares Ultra Gold',
        'SOXS': 'Direxion Daily Semiconductor Bear -3X Shares',
        'SOXL': 'Direxion Daily Semiconductor Bull 3X Shares',
        'FNGD': 'MicroSectors™ FANG+™ Index -3X Inverse Leveraged ETN',
        'FNGU': 'MicroSectors™ FANG+™ Index 3X Leveraged ETN',
        'AAPL': 'Apple',
        'MSFT': 'Microsoft',
        'GOOG': 'Alphabet C',
        'AMZN': 'Amazon.com',
        'AVGO': 'Broadcom',
        'NVDA': 'NVIDIA',
        'UNH': 'UnitedHealth',
        'TSM': 'Taiwan Semiconductor',
        'JNJ': 'Johnson & Johnson (JNJ)',
        'TCTZF': 'Tencent Holdings',
        'V': 'Visa A',
        'WMT': 'Walmart',
        'XOM': 'Exxon Mobil',
        'JPM': 'JPMorgan',
        'MA': 'Mastercard',
        'CVX': 'Chevron Corp',
        'HD': 'Home Depot',
        'BAC': 'Bank of America',
        'KO': 'Coca-Cola',
        'COST': 'Costco',
        'DIS': 'Walt Disney',
        'VZ': 'Verizon',
        'CSCO': 'Cisco',
        'ORCL': 'Oracle',
        'NKE': 'Nike',
        'ACN': 'Accenture',
        'ADBE': 'Adobe',
        'CRM': 'Salesforce.com',
        'INTC': 'Intel',
        'QCOM': 'Qualcomm',
        'AMD': 'AMD',
        'MS': 'Morgan Stanley',
        'T': 'AT&T',
        'HON': 'Honeywell',
        'IBM': 'IBM',
        'DQ': 'Daqo New Energy Corp ADR',
        'EBAY': 'eBay Inc',
        'NTAP': 'NetApp Inc',
        'ASML': 'ASML Holding NV ADR',
        'BABA': 'Alibaba Group Holdings Ltd ADR'}

    start_time = time.time()
    pd.options.display.float_format = '{:.4f}'.format
    pd.set_option('display.max_columns', None)

    # Step 1: schema plus the latest stored date per ticker. The connection
    # is closed before any download starts so it is not held during network
    # calls.
    latest_ymd = {}
    conn = sqlite3.connect(inFileName)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text, NAME text, ymd text, close REAL, diff REAL, open REAL, high REAL, low REAL, volume REAL)")
        cursor.execute("CREATE INDEX IF NOT EXISTS " + tableName + "_idx on " + tableName + " (CODE, ymd) ")
        conn.commit()
        for ticker in special_stocks:
            cursor.execute('SELECT ymd FROM ' + tableName + ' WHERE CODE=? order by ymd desc', (ticker,))
            found = cursor.fetchone()
            latest_ymd[ticker] = None if found is None else found[0]
        cursor.close()
    finally:
        conn.close()

    # Step 2: downloads. The override only needs to be installed once.
    # NOTE(review): confirm the installed yfinance still ships pdr_override.
    yfin.pdr_override()
    end_day = datetime.today().strftime('%Y-%m-%d')
    us_sotck_data = {}
    for ticker in special_stocks:
        start_day = (latest_ymd[ticker] or self.START_DATE).replace(".", "-")
        data = pdr.get_data_yahoo(ticker, start_day, end_day, auto_adjust=True, progress=False)
        if len(data) < 1:
            continue
        # Re-key the frame by "YYYY.MM.DD" strings, the format used for ymd.
        data['datetime'] = data.index.strftime("%Y.%m.%d")
        data.set_index('datetime', inplace=True)
        us_sotck_data[ticker] = {
            'close': data['Close'].to_dict(),
            'open': data['Open'].to_dict(),
            'high': data['High'].to_dict(),
            'low': data['Low'].to_dict(),
            'volume': data['Volume'].to_dict()
        }

    # Without the reference ticker there is no date list to walk; stop
    # cleanly instead of raising KeyError.
    if calendar_ticker not in us_sotck_data:
        print(calendar_ticker, 'returned no data; nothing stored')
        return
    dateList = list(us_sotck_data[calendar_ticker]['close'])

    # Steps 3 and 4: build and store the rows ticker by ticker.
    for idx, item_code in enumerate(us_sotck_data):
        stock = us_sotck_data[item_code]
        closes = stock['close']
        stock_data = []
        for i, ymd in enumerate(dateList):
            if ymd not in closes:
                continue
            if i > 0:
                previous = dateList[i - 1]
                if previous not in closes:
                    continue
                diff = closes[ymd] - closes[previous]
            else:
                diff = closes[ymd]
            stock_data.append({
                'CODE': item_code, 'NAME': special_stocks[item_code], 'ymd': ymd,
                'close': round(closes[ymd], 2), 'diff': round(diff, 2), 'open': round(stock['open'][ymd], 2),
                'high': round(stock['high'][ymd], 2), 'low': round(stock['low'][ymd], 2), 'volume': stock['volume'][ymd]
            })

        conn = sqlite3.connect(inFileName)
        try:
            cursor = conn.cursor()
            # Rows are written newest first.
            for item in reversed(stock_data):
                cursor.execute('SELECT * FROM ' + tableName + ' WHERE CODE=? and ymd=?', (item["CODE"], item['ymd'],))
                if cursor.fetchone() is None:
                    cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, ymd, close, diff, open, high, low, volume) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)", (item["CODE"], item["NAME"], item['ymd'], item['close'], item['diff'], item['open'], item['high'], item['low'], item['volume']))
            conn.commit()
            cursor.close()
        finally:
            # Release the database handle even when an insert fails.
            conn.close()

        print(idx, item_code, special_stocks[item_code], (time.time() - start_time), "s")
        start_time = time.time()
        sleep(delay)
def get_data(self, code, start_day, end_day, tick='.KS'):
    """Fetch daily prices for one code through the ``'naver'`` data reader.

    Args:
        code: Stock code; surrounding whitespace is stripped.
        start_day: First day of the requested range.
        end_day: Last day of the requested range.
        tick: Unused by the current implementation; kept so existing
            callers that pass it keep working.

    Returns:
        A list with one dict per returned row, holding ``ymd``
        (``"YYYY.MM.DD"``), ``diff`` (always 0), ``open``, ``close``,
        ``high``, ``low`` and ``volume``. The list is empty when no rows
        came back, and holds whatever was collected so far when an error
        interrupted the fetch.
    """
    stock = []
    try:
        data = pdr.DataReader(code.strip(), 'naver', start=start_day, end=end_day)
        if len(data) < 1:
            # Always hand back a list so callers can iterate the result.
            return stock
        # Re-key the frame by "YYYY.MM.DD" strings, the format used for ymd.
        data['datetime'] = data.index.strftime("%Y.%m.%d")
        data.set_index('datetime', inplace=True)
        for ymd, row in data.iterrows():
            stock.append({
                "ymd": ymd,
                'diff': 0,
                'open': row['Open'],
                'close': row['Close'],
                'high': row['High'],
                'low': row['Low'],
                'volume': row['Volume'],
            })
    except Exception as exc:
        # Best effort: report which code failed and why, then return what
        # was gathered. KeyboardInterrupt/SystemExit are not intercepted.
        print("error", code, exc)
    return stock
def crawl_specific_stock(self, code, ymd):
    """Fetch the last two weeks of daily prices for one stock code.

    Args:
        code: Stock code handed to ``get_data``.
        ymd: Currently not used; the requested range is always the two
            weeks ending today.
            NOTE(review): callers pass their latest stored date here --
            confirm whether the fixed window is intended.

    Returns:
        The list produced by ``get_data``; an empty list when the fetch
        raised or produced nothing.
    """
    today = datetime.today()
    start_day = (today - timedelta(weeks=2)).strftime('%Y-%m-%d')
    end_day = today.strftime('%Y-%m-%d')
    try:
        stock = self.get_data(code, start_day, end_day)
    except Exception:
        print(code, 'is not exist...')
        stock = []
    # Normalise a None result so callers can always iterate the value.
    return stock if stock is not None else []
def update(self, inFileName, outFileName):
    """Import a JSON-lines dump into the database.

    Every non-blank line of ``inFileName`` must be a JSON object with
    ``CODE``, ``NAME`` and ``PRICE`` keys. ``PRICE`` is stored as JSON text:
    a new row is inserted for an unknown ``CODE``, otherwise only the
    ``PRICE`` column of the existing row is overwritten.

    Example::

        inFileName = PROJECT_HOME + '/resources/stock.json.full'
        outFileName = PROJECT_HOME + '/resources/stock.db'
        crawler = StockCrawler("2000.01.01")
        crawler.update(inFileName, outFileName)

    NOTE(review): the ``stock`` table created here has different columns
    from the ``stock`` table the crawl_* methods create -- confirm the two
    are never pointed at the same database file.

    Args:
        inFileName: Path of the JSON-lines input file (read as UTF-8).
        outFileName: Path of the SQLite database file.
    """
    tableName = 'stock'
    # isolation_level=None puts the connection in autocommit mode, so every
    # statement is persisted immediately.
    conn = sqlite3.connect(outFileName, isolation_level=None)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text PRIMARY KEY, NAME text, PRICE text, MACD text, STOCHASTIC text, ICHIMOKU text, RSI text, BOLINGERBAND text)")
        idx = 0
        with open(inFileName, 'r', encoding='utf-8') as inFp:
            for line in inFp:
                line = line.strip()
                # Blank lines carry no record and are not valid JSON.
                if not line:
                    continue
                idx += 1
                stock = json.loads(line)
                print(idx, stock["CODE"], stock["NAME"])
                text = json.dumps(stock["PRICE"], ensure_ascii=False)
                cursor.execute('SELECT * FROM ' + tableName + ' WHERE CODE=?', (stock["CODE"],))
                if cursor.fetchone() is None:
                    cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, PRICE) VALUES(?, ?, ?)", (stock["CODE"], stock["NAME"], text))
                else:
                    cursor.execute("UPDATE " + tableName + " SET PRICE=? WHERE CODE=?", (text, stock["CODE"]))
        cursor.close()
    finally:
        # Release the database handle even when a line fails to parse.
        conn.close()
def saveIndex(self, code, inFileName, outFileName):
    """Merge a tab-separated index price file into the stored price list.

    Lines starting with ``#`` and blank lines are ignored. Each remaining
    line is split on tabs: field 0 is the date, 1 the close, 2 the open,
    3 the high, 4 the low and 6 the change with an optional ``%`` sign.
    Reading stops at the first line whose date equals the newest date
    already stored for ``code``, so the file is expected to list the most
    recent day first. The merged list is sorted by date and written back
    as JSON; the MACD, STOCHASTIC, ICHIMOKU and RSI columns are reset to
    ``"[{}]"``.

    Args:
        code: Identifier stored as both ``CODE`` and ``NAME``.
        inFileName: Path of the UTF-8 tab-separated input file.
        outFileName: Path of the SQLite database file.
    """
    tableName = 'stock'
    # Moving-average slots that are created with a value of 0 for each day.
    avg_windows = (3, 5, 7, 10, 20, 30, 60, 90, 100, 120, 150, 180, 200, 240)

    conn = sqlite3.connect(outFileName)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + " (CODE text PRIMARY KEY, NAME text, PRICE text, MACD text, STOCHASTIC text, ICHIMOKU text, RSI text, BOLINGERBAND text)")

        # Start from an empty list so a code that is not stored yet works.
        stock = {"NAME": code, "CODE": code, "PRICE": []}
        lastDay = ""
        cursor.execute('SELECT PRICE FROM ' + tableName + ' WHERE CODE=?', (stock["CODE"],))
        result = cursor.fetchone()
        if result is not None and result[0]:
            stock["PRICE"] = json.loads(result[0])
            if stock["PRICE"]:
                lastDay = stock["PRICE"][-1]["DATE"]

        with open(inFileName, "r", encoding="utf-8") as inFp:
            for line in inFp:
                line = line.strip()
                if not line or line[0] == "#":
                    continue
                arr = line.split("\t")
                # Everything from here on is already stored.
                if arr[0] == lastDay:
                    break
                price = {"DATE": arr[0], "close": float(arr[1]), "diff": float(arr[6].replace("%", "")), "open": float(arr[2]), "high": float(arr[3]), "low": float(arr[4]), "volume": 0}
                for window in avg_windows:
                    price['avg%d' % window] = 0
                stock["PRICE"].append(price)

        stock["PRICE"] = sorted(stock["PRICE"], key=lambda x: x['DATE'])
        text = json.dumps(stock['PRICE'], ensure_ascii=False)
        if result is None:
            cursor.execute("INSERT INTO " + tableName + "(CODE, NAME, PRICE, MACD, STOCHASTIC, ICHIMOKU, RSI) VALUES(?, ?, ?, ?, ?, ?, ?)", (stock["CODE"], stock["NAME"], text, "[{}]", "[{}]", "[{}]", "[{}]"))
        else:
            cursor.execute("UPDATE " + tableName + " SET PRICE=?, MACD=?, STOCHASTIC=?, ICHIMOKU=?, RSI=? WHERE CODE=?", (text, "[{}]", "[{}]", "[{}]", "[{}]", stock["CODE"]))
        conn.commit()
        cursor.close()
    finally:
        # Release the database handle even when a line fails to parse.
        conn.close()
if __name__ == "__main__":
    # Earliest date to download when the database holds nothing for a ticker.
    START_DATE = "2000.01.01"
    stockCrawler = StockCrawler(START_DATE)
    # The project root is three directory levels above this file. The path is
    # made absolute first: with a relative __file__ the dirname chain would
    # collapse to "" and the database path would point at the filesystem root.
    PROJECT_HOME = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    stockFileName = PROJECT_HOME + '/resources/stock.db'
    stockCrawler.crawl_special_stocks(stockFileName)