#!/usr/bin/env python3
# b32scanner/scanner.py
from bs4 import BeautifulSoup
import os, requests, configparser, re, json, threading, time, asyncio, aiohttp, logging, argparse, string, secrets
from datetime import datetime
requests.packages.urllib3.disable_warnings()
def string_to_list(text:str)->list:
    '''split a comma-separated string into a list of trimmed items'''
    return [item.strip() for item in text.split(',')]
def is_valid_path(path:str)->bool:
    '''does path exist'''
    return os.path.exists(path)
def is_localhost(url_:str)->bool:
    '''is link localhost'''
    p = '(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*'
    m = re.search(p, url_)
    try:
        return m.group('host')[:3] == '127'
    except Exception:
        return False
def get_router_url(url:str, short_format:bool)->str:
    '''extract host:port from a router address; short form, or the full https netdb leaseset URL'''
    p = '(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*'
    m = re.search(p, url)
    short = m.group('host')
    port = m.group('port')
    if short_format:
        router_url = f'{short}:{port}'
    else:
        router_url = f'https://{short}:{port}/netdb?l=1'
    return router_url
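# Example (illustrative address): for a configured router 'https://127.0.0.1:7667',
# get_router_url(addr, True) yields '127.0.0.1:7667' and
# get_router_url(addr, False) yields 'https://127.0.0.1:7667/netdb?l=1',
# the router console page that get_b32s() scrapes for leasesets.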
def short_url(url:str)->str:
    p = '(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*'
    m = re.search(p, url)
    short = m.group('host')
    return short
def config_load(config_path:str, setting_filename:str)->tuple:
    """
    Loads the config file.
    Keyword arguments:
    config_path -- the full path of the config file, should be in the same directory as this script
    setting_filename -- the filename of the config file
    """
    config = configparser.ConfigParser()
    config.read(config_path)
    try:
        routers = string_to_list(config['settings']['routers'])
        i2p_proxies = []
        for item in string_to_list(config['settings']['i2p_proxies']):
            i2p_proxies.append(f'http://{get_router_url(item, True)}')
        accepted_responses = []
        ar_string = string_to_list(config['settings']['accepted_responses'])
        for ar in ar_string:
            accepted_responses.append(int(ar))
        timer_ = {
            'check_leasesets_timer': int(config['settings']['check_leasesets_timer']),
            'crawl_timer': int(config['settings']['crawl_timer']),
        }
        retry_attempts = int(config['settings']['retry_attempts'])
        timeout_base = int(config['settings']['timeout_base'])
        group_size = int(config['settings']['group_size'])
        ar_list_converted = []
        for responses in accepted_responses:
            ar_list_converted.append(str(responses))
        settings_ = {
            'accepted_responses': ", ".join(ar_list_converted),
            'i2p_proxies': len(i2p_proxies),
            'check_leasesets_timer': timer_['check_leasesets_timer'],
            'crawl_timer': timer_['crawl_timer'],
            'retry_attempts': retry_attempts,
            'timeout_base': timeout_base,
            'group_size': group_size,
        }
    except (configparser.NoSectionError, configparser.NoOptionError, KeyError) as e:
        if is_valid_path(config_path):
            log.error(f"Can't read config file: {e}")
        else:
            log.error(f"The config file '{setting_filename}' was not found: {e}")
        raise
    except ValueError as e:
        log.error(f"Cannot accept value: {e}")
        raise
    return routers, i2p_proxies, accepted_responses, timer_, retry_attempts, timeout_base, group_size, settings_
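# A minimal sketch of the expected script_settings.ini, reconstructed from the keys read
# above (the values below are illustrative placeholders, not documented defaults):
#
#   [settings]
#   routers = https://127.0.0.1:7667
#   i2p_proxies = 127.0.0.1:4444
#   accepted_responses = 200, 301, 302, 403
#   check_leasesets_timer = 600
#   crawl_timer = 60
#   retry_attempts = 3
#   timeout_base = 30
#   group_size = 50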
def get_b32s(search_url:str, old_b32:dict)->dict:
    """
    Uses BeautifulSoup to scrape b32 addresses from a router's netdb page, returns dict
    """
    try:
        response = requests.get(search_url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table')
        for link_ in tables:
            sig_type = str(link_.find('span', {'class': 'nowrap stype'})).split('</span>')
            if len(sig_type) >= 2:
                sig_type = (sig_type[1]).split('</b> ')[1]
                ecies = bool(link_.find('span', {'title': 'ECIES_X25519'}))
                elg = bool(link_.find('span', {'title': 'ELGAMAL_2048'}))
                b32 = link_.find('a', {'target': '_blank'})
                if b32 and not b32.get('class'):
                    b32_href = short_url(b32.get('href'))
                    if not is_localhost(b32_href):
                        if b32_href not in old_b32:
                            old_b32[b32_href] = {
                                'signature': sig_type,
                                'ecies': ecies,
                                'elgamal': elg,
                                'tried': False,
                                'retried': 0,
                            }
        return old_b32
    except Exception as e:
        log.error(f"Can't reach router {search_url}: {e}")
        return old_b32
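# Note: the selectors above assume the router console's netdb markup, i.e. one <table> per
# leaseset with a 'nowrap stype' span for the signature type, spans titled 'ECIES_X25519' /
# 'ELGAMAL_2048' for the encryption keys, and a target="_blank" link whose href holds the
# b32 address. Other console skins may need different selectors.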
def delete_file(item:str)->None:
    '''tries to delete given file'''
    try:
        filename_ = os.path.basename(item)
        if os.path.exists(item):
            os.remove(item)
        else:
            log.debug(f"Couldn't delete: {filename_}")
    except Exception as e:
        log.error(f"Couldn't delete {item}: {e}")
def generate_json(b32_dict:dict, filename:str)->None:
    '''writes the given dict as JSON to a file in the script directory'''
    json_file = os.path.join(script_directory, filename)
    try:
        delete_file(json_file)
        with open(json_file, 'w') as fp:
            fp.write(json.dumps(b32_dict))
    except Exception as e:
        log.error(f"Can't write json file: {e}")
def load_json(filename:str)->dict:
    '''load the tracked data from a file'''
    json_file = os.path.join(script_directory, filename)
    b32_dict = {}
    try:
        with open(json_file) as fp:
            b32_dict = json.load(fp)
    except Exception as e:
        log.error(f"Can't load previous json file or doesn't exist: {e}")
    return b32_dict
def proxy_round_robin(i2p_proxies:list, f:int)->tuple:
    '''given the i2p_proxy list, increment f to round-robin requests, returning a proxy to use'''
    n = len(i2p_proxies)
    proxy = i2p_proxies[f]
    if f == n - 1:
        f = 0
    else:
        f += 1
    return (proxy, f)
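# Example: with two proxies and f starting at 0, successive calls hand out the proxies in
# order and wrap around (proxy addresses here are illustrative; the real list comes from
# script_settings.ini):
#   proxy_round_robin(['http://127.0.0.1:4444', 'http://127.0.0.1:4445'], 0) -> ('http://127.0.0.1:4444', 1)
#   proxy_round_robin(['http://127.0.0.1:4444', 'http://127.0.0.1:4445'], 1) -> ('http://127.0.0.1:4445', 0)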
def ping_multi(urls:list, proxy_url:list)->list:
    global scanned, accepted_responses
    scanned = 0
    async def async_ping(
        session: aiohttp.ClientSession,
        site:str,
        proxy_url:str,
        number_of_sites:int,
        vs_t:int,
        **kwargs
    ) -> dict:
        url = site
        global scanned
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0',
            'Referer': url,
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'iframe',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
        }
        start_time = time.time()
        # scale the timeout with the batch size so larger batches get more headroom
        if number_of_sites < 5:
            timeout = timeout_base
        else:
            timeout = timeout_base + (vs_t * 0.8)
        timed_out = False
        try:
            async with session.request('GET', url=url, headers=headers, proxy=proxy_url, timeout=timeout, **kwargs) as resp:
                # log.info(f"Requesting ({proxy_url}) -> {site}")
                try:
                    # flatten multi-valued response headers down to their first value
                    new_dict = {}
                    headers = {}
                    for k in set(resp.headers.keys()):
                        new_dict[k] = resp.headers.getall(k)
                    for item in new_dict:
                        headers[item] = new_dict[item][0]
                except Exception:
                    pass
        except Exception as e:
            shortened_ = short_url(site)
            log.error(f'TIMEOUT on {shortened_} {e}')
            timed_out = True
            timestamp_update = "0"
            ping = 0
            online_dict = {
                'b32': shortened_,
                'online': False,
                'tried': False,
            }
        end_time = time.time()
        response_time_ms = (end_time - start_time) * 1000
        try:
            if resp.status == 403:
                # the HTTP proxy answers 403 with its own CSP header when the eepsite is
                # unreachable; treat that as a proxy timeout rather than the site being online
                try:
                    if headers['Content-Security-Policy'].casefold() == '''default-src 'none'; style-src 'self' 'unsafe-inline' http://proxy.i2p; form-action 'self' http://proxy.i2p; frame-ancestors 'self' https://127.0.0.1:7667/; img-src http://proxy.i2p data:; font-src http://proxy.i2p'''.casefold():
                        next_ = False
                        timed_out = True
                        online_dict = {
                            'b32': short_url(site),
                            'online': False,
                            'tried': False,
                        }
                        log.error(f'I2P PROXY TIMEOUT on {short_url(site)}')
                    else:
                        next_ = True
                except Exception:
                    next_ = True
            elif resp.status in accepted_responses:
                next_ = True
            else:
                next_ = False
            if next_:
                timestamp_update = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())
                ping = round(response_time_ms, 2)
                online_dict = {
                    'b32': short_url(site),
                    'response': resp.status,
                    'headers': headers,
                    'ping': ping,
                    'first_seen': timestamp_update,
                    'online': True,
                    'tried': True,
                }
            else:
                if not timed_out:
                    online_dict = {
                        'b32': short_url(site),
                        'online': False,
                        'tried': True,
                    }
                else:
                    online_dict = {
                        'b32': short_url(site),
                        'online': False,
                        'tried': False,
                    }
        except Exception:
            if not timed_out:
                online_dict = {
                    'b32': short_url(site),
                    'online': False,
                    'tried': True,
                }
            else:
                online_dict = {
                    'b32': short_url(site),
                    'online': False,
                    'tried': False,
                }
        scanned += 1
        return online_dict
    async def gather_ping_general(sites:list, proxies:list, vs_t:int, **kwargs)->list:
        async with aiohttp.ClientSession() as session:
            tasks = []
            number_of_sites = len(sites)
            f = 0
            for s in sites:
                proxy, f = proxy_round_robin(proxies, f)
                tasks.append(async_ping(session=session, site=s, proxy_url=proxy, number_of_sites=number_of_sites, vs_t=vs_t, **kwargs))
            htmls = await asyncio.gather(*tasks, return_exceptions=True)
        return htmls
    htmls = asyncio.run(gather_ping_general(urls, proxy_url, 1))
    return htmls
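# ping_multi() returns one dict per requested site. Per the branches above, a site that
# answered with an accepted status yields
#   {'b32', 'response', 'headers', 'ping', 'first_seen', 'online': True, 'tried': True},
# a site that answered with a non-accepted status yields {'b32', 'online': False, 'tried': True},
# and a timeout or proxy error yields {'b32', 'online': False, 'tried': False}.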
def site_crawl_controller(urls_, b32_dict, sleep_, last_group, batch_):
    global online_dict, i2p_proxies
    start_time = time.time()
    pg = ping_multi(urls_, i2p_proxies)
    end_time = time.time()
    time_to_request = round(end_time - start_time, 2)
    log.info(f'{batch_} batch of {len(urls_)} requests over {len(i2p_proxies)} proxies done in {time_to_request} seconds')
    if update_status_:
        update_('status', f'{batch_} batch of {len(urls_)} requests over {len(i2p_proxies)} proxies done in {time_to_request} seconds')
    log.debug(f'Threads: {threading.active_count()}')
    new_list = []
    for item in pg:
        try:
            short_ = item['b32']
            if item['online']:
                b32_dict[short_]['tried'] = True
                b32_dict[short_]['online'] = True
                b32_dict[short_]['response'] = item['response']
                b32_dict[short_]['headers'] = item['headers']
                b32_dict[short_]['ping'] = item['ping']
                b32_dict[short_]['first_seen'] = item['first_seen']
                b32_dict[short_]['new'] = True
                new_list.append(short_)
            else:
                b32_dict[short_]['tried'] = item['tried']
                b32_dict[short_]['online'] = False
                b32_dict[short_]['retried'] += 1
        except TypeError as e:
            log.error(f'TypeError {e} {item}')
    if last_group:
        log.info('Updating data.json')
        generate_json(b32_dict, 'data.json')
        new_online_dict = {}
        online_dict = load_json('online.json')
        for k, v in b32_dict.items():
            try:
                if v['online']:
                    online_dict[k] = {
                        'response': v['response'],
                        'headers': v['headers'],
                        'ping': v['ping'],
                        'first_seen': v['first_seen'],
                        'signature': v['signature'],
                        'ecies': v['ecies'],
                        'elgamal': v['elgamal'],
                    }
                    if k in new_list:
                        new_online_dict[k] = {'response': v['response'], 'ping': v['ping']}
                        b32_dict[k]['new'] = False
            except KeyError:
                pass
        log.info('Updating online.json')
        generate_json(online_dict, 'online.json')
        if len(new_online_dict) > 0:
            log.info(f'>>>Found {len(new_list)} new online sites')
            for k, v in new_online_dict.items():
                log.info(f"{k} | PING {v['ping']}")
            log.info(f'>>>{len(online_dict)} http sites known, total of {len(b32_dict)} b32s known')
        else:
            log.info(f'>>>No new sites found. {len(online_dict)} http sites known, total of {len(b32_dict)} b32s known')
    else:
        time.sleep(sleep_)
    task_time = round(time.time() - start_time, 2)
    return task_time
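# On the last group of a crawl, two files are rewritten in the script directory:
#   data.json   - every known b32, keyed by address, with signature/ecies/elgamal plus
#                 tried/retried/online bookkeeping (and response/headers/ping/first_seen
#                 once a site has answered)
#   online.json - only the sites currently considered online, trimmed to
#                 response/headers/ping/first_seen/signature/ecies/elgamal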
def check_leasesets(routers):
    '''scrapes every configured router's netdb page and returns the untried b32 URLs to crawl'''
    b32_dict = load_json('data.json')
    start_time = time.time()
    start_count = len(b32_dict)
    for item in routers:
        log.info(f'Checking router {get_router_url(item, True)}')
        url_ = get_router_url(item, False)
        prev_count = len(b32_dict)
        b32_dict = get_b32s(url_, b32_dict)
        added_b32s = len(b32_dict) - prev_count
        if added_b32s != 0:
            log.debug(f'{added_b32s} b32s added.')
    total_added_b32s = len(b32_dict) - start_count
    if total_added_b32s != 0:
        log.debug(f'>>>TOTAL {total_added_b32s} b32s added.')
        if update_status_:
            update_('status', f'{total_added_b32s} b32s added from {len(routers)} routers')
    else:
        log.debug('No new b32s added')
    urls_ = []
    for item in b32_dict:
        if not b32_dict[item]['tried']:
            if b32_dict[item]['retried'] <= retry_attempts:
                if item[:7].casefold() != 'http://'.casefold():
                    urls_.append(f'http://{short_url(item)}')
                else:
                    urls_.append(item)
    # log.debug(f'Threads: {threading.active_count()}')
    task_time = round(time.time() - start_time, 2)
    return urls_, b32_dict, task_time
def update_api(msg: str, txtfile:str)->None:
    '''given a filename and a string, write a one line txt file in the script directory'''
    file_ = f"{script_directory}/{txtfile}"
    with open(file_, 'w') as file_object:
        file_object.write(f'{msg}\n')
def random_string(size:int)->str:
    '''takes an int, gives a random string of that length'''
    letters = string.ascii_lowercase + string.ascii_uppercase + string.digits
    return ''.join(secrets.choice(letters) for i in range(size))
def send_message(payload:dict, url_:str)->tuple:
    '''given a payload and a url, does an http post request'''
    try:
        response = requests.post(url_, json=payload)
        if response.status_code == 200:
            return True, 'Sent'
        else:
            return False, f'Failed to send message. Status code: {response.status_code}'
    except Exception:
        return False, 'Failed to send'
def update_(key_, msg_):
    '''POSTs a keyed status update to the local /status endpoint'''
    payload = {
        'key': api_key,
        key_: msg_,
        'settings': settings_,
    }
    url_ = f'http://{hostname}:{flask_port}/status'
    sent_ = send_message(payload, url_)[0]
    if sent_:
        log.info(f'status message sent to port {flask_port}')
    else:
        log.debug(f'status message failed to send to port {flask_port}')
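# Status updates are POSTed as JSON to http://{hostname}:{flask_port}/status, e.g.
# (illustrative message text):
#   {'key': '<api key written to .apikey>', 'status': 'Requesting 120 sites now', 'settings': {...}}
# The receiving endpoint (presumably a companion Flask app, given flask_port) is assumed to
# live in a separate script and is not part of this file.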
hostname = '127.0.0.1'
flask_port = '5071'
update_status_ = True
setting_filename:str = 'script_settings.ini'
script_directory:str = os.path.dirname(os.path.abspath(__file__))
config_path:str = os.path.join(script_directory, setting_filename)
online_dict:dict = {}
parser = argparse.ArgumentParser()
parser.add_argument("-l", "--loglevel", default='WARNING',
                    choices=logging.getLevelNamesMapping().keys(), help="Set log level")
args = parser.parse_args()
level_ = args.loglevel.upper()
logging.basicConfig(filename=os.path.join(script_directory, 'b32scanner.log'), encoding='utf-8', level=level_)
log = logging.getLogger('B32SCANNER')
# logging is configured before config_load() so config errors can be reported via log
routers, i2p_proxies, accepted_responses, timer_, retry_attempts, timeout_base, group_size, settings_ = config_load(config_path, setting_filename)
if __name__ == "__main__":
if update_status_:
import string, secrets
api_key = random_string(40)
update_api(api_key, '.apikey')
log_start = [
f'starting b32 scanner'.upper(),
f"checking {len(routers)} routers: {' | '.join(routers)}".upper(),
f"checking {len(routers)} routers: {' | '.join(routers)}".upper(),
f"using {len(i2p_proxies)} http tunnels: {' | '.join(i2p_proxies)}".upper(),
f"checking leasesets every {timer_['check_leasesets_timer']} seconds, crawling every {timer_['crawl_timer']} seconds, using {retry_attempts} retries".upper(),
]
for lg_ in log_start:
log.info(lg_)
first_run:bool = True
task_time_b:int = 0
    while True:
        task_time:int = 0
        if first_run:
            start_time = time.time()
            urls_, b32_dict, task_time_a = check_leasesets(routers)
            task_time += task_time_a
        else:
            task_time += task_time_b
        log.info(f'>>>Requesting {len(urls_)} sites now')
        if update_status_:
            update_('status', f'Requesting {len(urls_)} sites now')
        # split the url list into batches of group_size requests
        groups = int(len(urls_) / group_size) + int(len(urls_) % group_size > 0)
        a = 0
        b = group_size
        for i in range(0, groups):
            if b >= len(urls_):
                last_group = True
            else:
                last_group = False
            batch_ = f'[{i+1}/{groups}]'
            task_time += site_crawl_controller(urls_[a:b], b32_dict, 3, last_group, batch_)
            if round(time.time() - start_time, 2) >= timer_['check_leasesets_timer']:
                urls_, b32_dict, task_time_a = check_leasesets(routers)
                task_time += task_time_a
                start_time = time.time()
            a += group_size
            b += group_size
        if first_run:
            first_run = False
        to_sleep = timer_['check_leasesets_timer'] - task_time
        if update_status_:
            update_('status', f'waiting {round(to_sleep, 2)} seconds')
        if to_sleep >= 0:
            if to_sleep > 60:
                # sleep in one-minute chunks so leasesets can still be re-checked while waiting
                while to_sleep > 0:
                    if to_sleep > 60:
                        sleeping = 60
                    else:
                        sleeping = to_sleep
                    if update_status_:
                        update_('status', f'waiting {round(to_sleep, 2)} seconds')
                    time.sleep(sleeping)
                    to_sleep -= sleeping
                    if round(time.time() - start_time, 2) >= timer_['check_leasesets_timer']:
                        urls_, b32_dict, task_time_b = check_leasesets(routers)
                        start_time = time.time()
            else:
                task_time_b = 0
                time.sleep(to_sleep)