#!/usr/bin/env python3

from bs4 import BeautifulSoup

import os, requests, configparser, re, json, threading, time, asyncio, aiohttp, logging, argparse, string, secrets

from datetime import datetime


# suppress self-signed-certificate warnings from requests to the router console
requests.packages.urllib3.disable_warnings()

def string_to_list(text: str) -> list:
    '''split a comma-separated string into a list of whitespace-trimmed items'''
    return [item.strip() for item in text.split(',')]

def is_valid_path(path: str) -> bool:
    '''does path exist'''
    return os.path.exists(path)

def is_localhost(url_: str) -> bool:
    '''is the link localhost (host starts with 127)'''
    p = r'(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*'
    m = re.search(p, url_)
    try:
        return m.group('host').startswith('127')
    except Exception:
        return False

def get_router_url(url: str, short_format: bool) -> str:
    '''return either host:port or the full https netDb URL for a router console address'''
    p = r'(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*'
    m = re.search(p, url)
    short = m.group('host')
    port = m.group('port')
    if short_format:
        router_url = f'{short}:{port}'
    else:
        router_url = f'https://{short}:{port}/netdb?l=1'
    return router_url

def short_url(url: str) -> str:
    '''strip scheme, port and path, returning just the host part of a URL'''
    p = r'(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*'
    m = re.search(p, url)
    return m.group('host')

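# Note on the shared host/port pattern used above: it is deliberately permissive.
# Illustratively, 'https://127.0.0.1:7667/netdb?l=1' yields host '127.0.0.1' and port '7667',
# while a bare b32 hostname yields the hostname itself and an empty port string.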
def config_load(config_path: str, setting_filename: str) -> tuple:
    """
    Loads the config file.

    Keyword arguments:
    config_path -- the full path of the config file, should be in the same directory as this script
    setting_filename -- the filename of the config file
    """
    config = configparser.ConfigParser()
    config.read(config_path)
    try:
        routers = string_to_list(config['settings']['routers'])
        i2p_proxies = []
        for item in string_to_list(config['settings']['i2p_proxies']):
            i2p_proxies.append(f'http://{get_router_url(item, True)}')
        accepted_responses = []
        for ar in string_to_list(config['settings']['accepted_responses']):
            accepted_responses.append(int(ar))
        timer_ = {
            'check_leasesets_timer': int(config['settings']['check_leasesets_timer']),
            'crawl_timer': int(config['settings']['crawl_timer']),
        }
        retry_attempts = int(config['settings']['retry_attempts'])
        timeout_base = int(config['settings']['timeout_base'])
        group_size = int(config['settings']['group_size'])
        settings_ = {
            'accepted_responses': ", ".join(str(r) for r in accepted_responses),
            'i2p_proxies': len(i2p_proxies),
            'check_leasesets_timer': timer_['check_leasesets_timer'],
            'crawl_timer': timer_['crawl_timer'],
            'retry_attempts': retry_attempts,
            'timeout_base': timeout_base,
            'group_size': group_size,
        }
    except (configparser.NoSectionError, configparser.NoOptionError, KeyError) as e:
        if is_valid_path(config_path):
            log.error(f"Can't read config file: {e}")
        else:
            log.error(f"The config file '{setting_filename}' was not found: {e}")
        raise
    except ValueError as e:
        log.error(f"Cannot accept value: {e}")
        raise
    return routers, i2p_proxies, accepted_responses, timer_, retry_attempts, timeout_base, group_size, settings_

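# Sketch of a minimal 'script_settings.ini' accepted by config_load(); only the key names
# are taken from the code above, the values shown are illustrative assumptions:
#
#   [settings]
#   routers = https://127.0.0.1:7667
#   i2p_proxies = 127.0.0.1:4444
#   accepted_responses = 200, 301, 302, 403
#   check_leasesets_timer = 900
#   crawl_timer = 60
#   retry_attempts = 3
#   timeout_base = 30
#   group_size = 50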
def get_b32s(search_url: str, old_b32: dict) -> dict:
    """
    Uses BeautifulSoup to scrape b32 links from a router's netDb page, returns the updated dict.
    """
    try:
        response = requests.get(search_url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link_ in soup.find_all('table'):
            sig_type = str(link_.find('span', {'class': 'nowrap stype'})).split('</span>')
            if len(sig_type) >= 2:
                sig_type = sig_type[1].split('</b> ')[1]
            ecies = bool(link_.find('span', {'title': 'ECIES_X25519'}))
            elg = bool(link_.find('span', {'title': 'ELGAMAL_2048'}))
            b32 = link_.find('a', {'target': '_blank'})
            if b32 and not b32.get('class'):
                b32_href = short_url(b32.get('href'))
                if not is_localhost(b32_href) and b32_href not in old_b32:
                    old_b32[b32_href] = {
                        'signature': sig_type,
                        'ecies': ecies,
                        'elgamal': elg,
                        'tried': False,
                        'retried': 0,
                    }
        return old_b32
    except Exception as e:
        log.error(f"Can't reach router {search_url}: {e}")
        return old_b32

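# Shape of each entry get_b32s() adds to the tracking dict (keys as built above; the
# address and signature value are illustrative placeholders):
#   '<host>.b32.i2p': {'signature': 'EdDSA_SHA512_Ed25519', 'ecies': True,
#                      'elgamal': False, 'tried': False, 'retried': 0}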
def delete_file(item: str) -> None:
    '''tries to delete the given file'''
    filename_ = os.path.basename(item)
    try:
        if os.path.exists(item):
            os.remove(item)
        else:
            log.debug(f"Couldn't delete (not found): {filename_}")
    except Exception as e:
        log.error(f"Couldn't delete {filename_}: {e}")

def generate_json(b32_dict: dict, filename: str) -> None:
    '''writes the given dict as a json file in the script directory, replacing any old copy'''
    json_file = os.path.join(script_directory, filename)
    try:
        delete_file(json_file)
        with open(json_file, 'w') as fp:
            json.dump(b32_dict, fp)
    except Exception as e:
        log.error(f"Can't write json file: {e}")

def load_json(filename: str) -> dict:
    '''load the tracked data from a json file, returning an empty dict on failure'''
    json_file = os.path.join(script_directory, filename)
    b32_dict = {}
    try:
        with open(json_file) as fp:
            b32_dict = json.load(fp)
    except Exception as e:
        log.error(f"Can't load previous json file or it doesn't exist: {e}")
    return b32_dict

def proxy_round_robin(i2p_proxies: list, f: int) -> tuple:
    '''given the i2p_proxies list and the current index f, return (proxy to use, next index) so requests round-robin across the proxies'''
    proxy = i2p_proxies[f]
    f = (f + 1) % len(i2p_proxies)
    return (proxy, f)

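# Illustrative use, as in gather_ping_general() below: with two proxies the index cycles 0, 1, 0, 1, ...
#   proxy, f = proxy_round_robin(i2p_proxies, f)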
def ping_multi(urls: list, proxies: list) -> list:
    '''asynchronously request every url through the round-robined i2p proxies, returning one result dict per url'''
    global scanned, accepted_responses
    scanned = 0

    async def async_ping(
        session: aiohttp.ClientSession,
        site: str,
        proxy_url: str,
        number_of_sites: int,
        vs_t: int,
        **kwargs
    ) -> dict:
        url = site
        global scanned
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0',
            'Referer': url,
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'iframe',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
        }
        start_time = time.time()
        # scale the timeout with the batch size so larger groups get more slack
        if number_of_sites < 5:
            timeout = timeout_base
        else:
            timeout = timeout_base + (vs_t * 0.8)
        timed_out = False
        resp_headers = {}
        try:
            # single GET per site; status/headers stay readable after the block closes
            async with session.request('GET', url=url, headers=headers, proxy=proxy_url, timeout=timeout, **kwargs) as resp:
                # log.info(f"Requesting ({proxy_url}) -> {site}")
                try:
                    new_dict = {}
                    for k in set(resp.headers.keys()):
                        new_dict[k] = resp.headers.getall(k)
                    for item in new_dict:
                        resp_headers[item] = new_dict[item][0]
                except Exception:
                    pass
        except Exception as e:
            shortened_ = short_url(site)
            log.error(f'TIMEOUT on {shortened_} {e}')
            timed_out = True
            timestamp_update = "0"
            ping = 0
            online_dict = {
                'b32': shortened_,
                'online': False,
                'tried': False,
            }
        end_time = time.time()
        response_time_ms = (end_time - start_time) * 1000
        try:
            if resp.status == 403:
                try:
                    # the i2p http proxy answers 403 with this CSP header when the destination is unreachable
                    if resp_headers['Content-Security-Policy'].casefold() == '''default-src 'none'; style-src 'self' 'unsafe-inline' http://proxy.i2p; form-action 'self' http://proxy.i2p; frame-ancestors 'self' https://127.0.0.1:7667/; img-src http://proxy.i2p data:; font-src http://proxy.i2p'''.casefold():
                        next_ = False
                        timed_out = True
                        online_dict = {
                            'b32': short_url(site),
                            'online': False,
                            'tried': False,
                        }
                        log.error(f'I2P PROXY TIMEOUT on {short_url(site)}')
                except Exception:
                    next_ = True
            elif resp.status in accepted_responses:
                next_ = True
            else:
                next_ = False
            if next_:
                timestamp_update = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())
                ping = round(response_time_ms, 2)
                online_dict = {
                    'b32': short_url(site),
                    'response': resp.status,
                    'headers': resp_headers,
                    'ping': ping,
                    'first_seen': timestamp_update,
                    'online': True,
                    'tried': True,
                }
            else:
                if not timed_out:
                    online_dict = {
                        'b32': short_url(site),
                        'online': False,
                        'tried': True,
                    }
                else:
                    online_dict = {
                        'b32': short_url(site),
                        'online': False,
                        'tried': False,
                    }
        except Exception:
            if not timed_out:
                online_dict = {
                    'b32': short_url(site),
                    'online': False,
                    'tried': True,
                }
            else:
                online_dict = {
                    'b32': short_url(site),
                    'online': False,
                    'tried': False,
                }
        try:
            scanned += 1
        except Exception as e:
            log.error(f'ERROR {e}')
        return online_dict

    async def gather_ping_general(sites: list, proxies: list, vs_t: int, **kwargs) -> list:
        async with aiohttp.ClientSession() as session:
            tasks = []
            number_of_sites = len(sites)
            f = 0
            for s in sites:
                proxy, f = proxy_round_robin(proxies, f)
                tasks.append(async_ping(session=session, site=s, proxy_url=proxy, number_of_sites=number_of_sites, vs_t=vs_t, **kwargs))
            htmls = await asyncio.gather(*tasks, return_exceptions=True)
            return htmls

    htmls = asyncio.run(gather_ping_general(urls, proxies, 1))
    return htmls

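# Each element returned by ping_multi() is a result dict as assembled above; roughly
#   {'b32': <host>, 'response': 200, 'headers': {...}, 'ping': 123.45,
#    'first_seen': 'YYYY-MM-DD HH:MM:SS', 'online': True, 'tried': True}
# on success, or just {'b32': <host>, 'online': False, 'tried': True/False} on failure.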
def site_crawl_controller(urls_, b32_dict, sleep_, last_group, batch_):
    '''request one batch of urls, fold the results back into b32_dict and, on the last batch, write data.json / online.json'''
    global online_dict, i2p_proxies
    start_time = time.time()
    pg = ping_multi(urls_, i2p_proxies)
    end_time = time.time()
    response_time = end_time - start_time
    time_to_request = round(response_time, 2)
    log.info(f'{batch_} batch of {len(urls_)} requests over {len(i2p_proxies)} proxies done in {time_to_request} seconds')
    if update_status_:
        update_('status', f'{batch_} batch of {len(urls_)} requests over {len(i2p_proxies)} proxies done in {time_to_request} seconds')
    log.debug(f'Threads: {threading.active_count()}')
    new_list = []
    for item in pg:
        try:
            short_ = item['b32']
            if item['online']:
                b32_dict[short_]['tried'] = True
                b32_dict[short_]['online'] = True
                b32_dict[short_]['response'] = item['response']
                b32_dict[short_]['headers'] = item['headers']
                b32_dict[short_]['ping'] = item['ping']
                b32_dict[short_]['first_seen'] = item['first_seen']
                b32_dict[short_]['new'] = True
                new_list.append(short_)
            else:
                b32_dict[short_]['tried'] = item['tried']
                b32_dict[short_]['online'] = False
                b32_dict[short_]['retried'] += 1
        except TypeError as e:
            log.error(f'TypeError {e} {item}')
    if last_group:
        log.info('Updating data.json')
        generate_json(b32_dict, 'data.json')
        new_online_dict = {}
        online_dict = load_json('online.json')
        for k, v in b32_dict.items():
            try:
                if v['online']:
                    online_dict[k] = {
                        'response': v['response'],
                        'headers': v['headers'],
                        'ping': v['ping'],
                        'first_seen': v['first_seen'],
                        'signature': v['signature'],
                        'ecies': v['ecies'],
                        'elgamal': v['elgamal'],
                    }
                    if k in new_list:
                        new_online_dict[k] = {'response': v['response'], 'ping': v['ping']}
                        b32_dict[k]['new'] = False
            except KeyError:
                pass
        log.info('Updating online.json')
        generate_json(online_dict, 'online.json')
        if len(new_online_dict) > 0:
            log.info(f'>>>Found {len(new_list)} new online sites')
            for k, v in new_online_dict.items():
                log.info(f"{k} | PING {v['ping']}")
            log.info(f'>>>{len(online_dict)} http sites online, {len(b32_dict)} b32s known')
        else:
            log.info(f'>>>No new sites found. {len(online_dict)} http sites known, total of {len(b32_dict)} b32s known')
    else:
        time.sleep(sleep_)
    task_time = round((time.time() - start_time), 2)
    return task_time

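# online.json written above keeps only sites that answered, keyed by b32 host, each entry
# carrying the response, headers, ping, first_seen, signature, ecies and elgamal fields.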
def check_leasesets(routers):
    '''pull leasesets from each router console, merge new b32s into the tracked data and build the list of urls still to try'''
    b32_dict = load_json('data.json')
    start_time = time.time()
    start_count = len(b32_dict)
    for item in routers:
        log.info(f'Checking router {get_router_url(item, True)}')
        url_ = get_router_url(item, False)
        prev_count = len(b32_dict)
        b32_dict = get_b32s(url_, b32_dict)
        added_b32s = len(b32_dict) - prev_count
        if added_b32s != 0:
            log.debug(f'{added_b32s} b32s added.')
    total_added_b32s = len(b32_dict) - start_count
    if total_added_b32s != 0:
        log.debug(f'>>>TOTAL {total_added_b32s} b32s added.')
        if update_status_:
            update_('status', f'{total_added_b32s} b32s added from {len(routers)} routers')
    else:
        log.debug('No new b32s added')
    urls_ = []
    for item in b32_dict:
        if not b32_dict[item]['tried'] and b32_dict[item]['retried'] <= retry_attempts:
            if not item.casefold().startswith('http://'):
                urls_.append(f'http://{short_url(item)}')
            else:
                urls_.append(item)
    # log.debug(f'Threads: {threading.active_count()}')
    task_time = round((time.time() - start_time), 2)
    return urls_, b32_dict, task_time

def update_api(msg: str, txtfile: str) -> None:
    '''given a filename and a string, write a one-line txt file in the script directory'''
    file_ = os.path.join(script_directory, txtfile)
    with open(file_, 'w') as file_object:
        file_object.write(f'{msg}\n')

def random_string(size: int) -> str:
    '''takes an int, gives a random string of that length'''
    letters = string.ascii_lowercase + string.ascii_uppercase + string.digits
    return ''.join(secrets.choice(letters) for i in range(size))

def send_message(payload: dict, url_: str) -> tuple:
    '''given a payload and a url, does an http post request'''
    try:
        response = requests.post(url_, json=payload)
        if response.status_code == 200:
            return True, 'Sent'
        else:
            return False, f'Failed to send message. Status code: {response.status_code}'
    except Exception as e:
        return False, f'Failed to send: {e}'

def update_(key_, msg_):
    '''post a status update (plus current settings) to the local flask status endpoint'''
    payload = {
        'key': api_key,
        key_: msg_,
        'settings': settings_,
    }
    url_ = f'http://{hostname}:{flask_port}/status'
    sent_ = send_message(payload, url_)[0]
    if sent_:
        log.info(f'status message sent to port {flask_port}')
    else:
        log.debug(f'status message failed to send to port {flask_port}')

hostname = '127.0.0.1'
flask_port = '5071'
update_status_ = True
setting_filename: str = 'script_settings.ini'
script_directory: str = os.path.dirname(os.path.abspath(__file__))
config_path: str = os.path.join(script_directory, setting_filename)
online_dict: dict = {}

parser = argparse.ArgumentParser()
parser.add_argument("-l", "--loglevel", default='WARNING',
                    choices=logging.getLevelNamesMapping().keys(), help="Set log level")
args = parser.parse_args()
try:
    level_ = args.loglevel.upper()
except Exception:
    level_ = 'WARNING'

logging.basicConfig(filename=os.path.join(script_directory, 'b32scanner.log'), encoding='utf-8', level=level_)
log = logging.getLogger('B32SCANNER')

# load the config only after logging is configured, so config_load() can report problems
routers, i2p_proxies, accepted_responses, timer_, retry_attempts, timeout_base, group_size, settings_ = config_load(config_path, setting_filename)

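# Illustrative invocation (the script filename here is assumed; adjust to the actual file name):
#   python3 b32scanner.py -l INFO
# valid --loglevel values are the standard logging level names (DEBUG, INFO, WARNING, ...).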
if __name__ == "__main__":
    if update_status_:
        # generate a fresh api key, write it to .apikey, and include it in every status payload
        api_key = random_string(40)
        update_api(api_key, '.apikey')

    log_start = [
        'starting b32 scanner'.upper(),
        f"checking {len(routers)} routers: {' | '.join(routers)}".upper(),
        f"using {len(i2p_proxies)} http tunnels: {' | '.join(i2p_proxies)}".upper(),
        f"checking leasesets every {timer_['check_leasesets_timer']} seconds, crawling every {timer_['crawl_timer']} seconds, using {retry_attempts} retries".upper(),
    ]
    for lg_ in log_start:
        log.info(lg_)

    first_run: bool = True
    task_time_b: int = 0
    while True:
        task_time: int = 0
        if first_run:
            start_time = time.time()
            urls_, b32_dict, task_time_a = check_leasesets(routers)
            task_time += task_time_a
        else:
            task_time += task_time_b
        log.info(f'>>>Requesting {len(urls_)} sites now')
        if update_status_:
            update_('status', f'Requesting {len(urls_)} sites now')
        # ceiling division: number of batches of group_size needed to cover urls_
        groups = int(len(urls_) / group_size) + int(len(urls_) % group_size > 0)
        a = 0
        b = group_size
        for i in range(0, groups):
            last_group = b >= len(urls_)
            batch_ = f'[{i+1}/{groups}]'
            task_time += site_crawl_controller(urls_[a:b], b32_dict, 3, last_group, batch_)
            # refresh the leaseset list mid-crawl if the check timer has elapsed
            if round((time.time() - start_time), 2) >= timer_['check_leasesets_timer']:
                urls_, b32_dict, task_time_a = check_leasesets(routers)
                task_time += task_time_a
                start_time = time.time()
            a += group_size
            b += group_size
        if first_run:
            first_run = False
        to_sleep = timer_['check_leasesets_timer'] - task_time
        if update_status_:
            update_('status', f'waiting {round(to_sleep, 2)} seconds')
        if to_sleep >= 0:
            if to_sleep > 60:
                # sleep in one-minute slices so the status endpoint and the leaseset timer stay current
                while to_sleep > 0:
                    if to_sleep > 60:
                        sleeping = 60
                    else:
                        sleeping = to_sleep
                    if update_status_:
                        update_('status', f'waiting {round(to_sleep, 2)} seconds')
                    time.sleep(sleeping)
                    to_sleep -= sleeping
                    if round((time.time() - start_time), 2) >= timer_['check_leasesets_timer']:
                        urls_, b32_dict, task_time_b = check_leasesets(routers)
                        start_time = time.time()
                    else:
                        task_time_b = 0
            else:
                time.sleep(to_sleep)