diff --git a/stockpredictor/crawler/sQLite/MetaCrawler.py b/stockpredictor/crawler/sQLite/MetaCrawler.py index a0622c1..4f8dd09 100644 --- a/stockpredictor/crawler/sQLite/MetaCrawler.py +++ b/stockpredictor/crawler/sQLite/MetaCrawler.py @@ -57,6 +57,8 @@ class MetaCrawler: break except: print(input['URL'] + '&page=%s' % i) + if i>200: + break continue # 마지막 페이지 까지 받기 @@ -141,6 +143,8 @@ class MetaCrawler: break except: print(url + str(i)) + if i > 200: + break continue # 마지막 페이지 까지 받기 @@ -219,6 +223,8 @@ class MetaCrawler: break except: print(url + str(i)) + if i > 200: + break continue # 마지막 페이지 까지 받기 @@ -306,6 +312,8 @@ class MetaCrawler: break except: print (input['URL'] + '&page=%s' % i) + if i>200: + break continue ymd, close, diff, rate = "", 0.0, 0.0, 0.0 @@ -388,6 +396,8 @@ class MetaCrawler: break except: print(input['URL'] + '&page=%s' % i) + if i>200: + break continue # 마지막 페이지 까지 받기 diff --git a/stockpredictor/crawler/sQLite/StockCrawler.py b/stockpredictor/crawler/sQLite/StockCrawler.py index 4d11512..5c1456b 100644 --- a/stockpredictor/crawler/sQLite/StockCrawler.py +++ b/stockpredictor/crawler/sQLite/StockCrawler.py @@ -387,6 +387,8 @@ class StockCrawler: break except: print(pg_url) + if page > 200: + break continue for date in html[0].날짜.values: diff --git a/stockpredictor/crawler/toJsonFile/MetaCrawler.py b/stockpredictor/crawler/toJsonFile/MetaCrawler.py index a7e4846..a660723 100644 --- a/stockpredictor/crawler/toJsonFile/MetaCrawler.py +++ b/stockpredictor/crawler/toJsonFile/MetaCrawler.py @@ -2,6 +2,7 @@ import json import datetime import requests import pandas as pd +from time import sleep import os class MetaCrawler: @@ -46,8 +47,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(input['URL'] + '&page=%s' % i, header=0) - html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(i) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 1: @@ -120,8 +130,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(url + str(i), header=0) - html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(i) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 2: @@ -181,8 +200,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(url + str(i), header=0, encoding='euc-kr') - html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text, encoding='euc-kr') - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text, encoding='euc-kr') + sleep(0.5) + break + except: + print(url + str(i)) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 10: @@ -253,8 +281,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(input['URL'] + '&page=%s' % i, header=0) - html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(i) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 1: diff --git a/stockpredictor/crawler/toJsonFile/StockCrawler.py b/stockpredictor/crawler/toJsonFile/StockCrawler.py index ebf59f2..c130a77 100644 --- a/stockpredictor/crawler/toJsonFile/StockCrawler.py +++ b/stockpredictor/crawler/toJsonFile/StockCrawler.py @@ -5,6 +5,7 @@ import re import json import os import requests +from time import sleep class Queue(object): def __init__(self, max): @@ -255,8 +256,17 @@ class StockCrawler: # 최근 상장 기업의 마지막 반복되는 페이지를 제외시킨다. pg_url = '{url}&page={page}'.format(url=url, page=page) #html = pd.read_html(pg_url, header=0) - html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(pg_url) + if page > 200: + break + continue count = 0 for date in html[0].날짜.values: @@ -379,8 +389,17 @@ class StockCrawler: # 최근 상장 기업의 마지막 반복되는 페이지를 제외시킨다. pg_url = '{url}&page={page}'.format(url=url, page=page) #html = pd.read_html(pg_url, header=0) - html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(pg_url) + if page > 200: + break + continue count = 0 for date in html[0].날짜.values: @@ -603,8 +622,17 @@ class StockCrawler: # 최근 상장 기업의 마지막 반복되는 페이지를 제외시킨다. pg_url = '{url}&page={page}'.format(url=url, page=page) #html = pd.read_html(pg_url, header=0) - html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(pg_url) + if page > 200: + break + continue count = 0 for date in html[0].날짜.values: diff --git a/stockpredictor/crawler/toSQLite/MetaCrawler.py b/stockpredictor/crawler/toSQLite/MetaCrawler.py index 7494935..bb0f0d6 100644 --- a/stockpredictor/crawler/toSQLite/MetaCrawler.py +++ b/stockpredictor/crawler/toSQLite/MetaCrawler.py @@ -2,7 +2,7 @@ import json import datetime import requests import sqlite3 - +from time import sleep import pandas as pd import os @@ -46,8 +46,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(input['URL'] + '&page=%s' % i, header=0) - html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(input['URL'] + '&page=%s' % i) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 1: @@ -126,8 +135,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(url + str(i), header=0) - html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(url + str(i)) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 2: @@ -192,8 +210,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(url + str(i), header=0, encoding='euc-kr') - html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text, encoding='euc-kr') - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text, encoding='euc-kr') + sleep(0.5) + break + except: + print(url + str(i)) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 10: @@ -267,8 +294,17 @@ class MetaCrawler: finish = False for i in range(1, self.limit_page_count): #html = pd.read_html(input['URL'] + '&page=%s' % i, header=0) - html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(input['URL'] + '&page=%s' % i) + if i > 200: + break + continue # 마지막 페이지 까지 받기 if len(html[0].날짜.values) <= 1: diff --git a/stockpredictor/crawler/toSQLite/StockCrawler.py b/stockpredictor/crawler/toSQLite/StockCrawler.py index 229c9b8..e63a685 100644 --- a/stockpredictor/crawler/toSQLite/StockCrawler.py +++ b/stockpredictor/crawler/toSQLite/StockCrawler.py @@ -7,6 +7,7 @@ import sqlite3 import requests import math import time +from time import sleep class Queue(object): def __init__(self, max): @@ -228,8 +229,17 @@ class StockCrawler: # 최근 상장 기업의 마지막 반복되는 페이지를 제외시킨다. pg_url = '{url}&page={page}'.format(url=url, page=page) #html = pd.read_html(pg_url, header=0) - html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) - sleep(0.5) + html = None + while True: + try: + html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text) + sleep(0.5) + break + except: + print(pg_url) + if page > 200: + break + continue for date in html[0].날짜.values: if type(date) is str: