#!/usr/bin/python3

#	cve-manager : CVE management tool
#	Copyright (C) 2017-2026 Alexey Appolonov
#
#	This program is free software: you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation, either version 3 of the License, or
#	(at your option) any later version.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with this program.  If not, see <http://www.gnu.org/licenses/>.

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

import argparse
import os
import re
import subprocess
from cve_manager.url import FMT_GITHUB, FMT_SOURCEFG, FMT_PYTHON, URL_SPEC, \
	ParseURL

# Shown by "--help"; The documented line format has to match what GetCols()
# parses, i.e. the initial URL is separated from the related URLs by " >> "
DESCRIPTION = '''
	Process gathered info in various ways; Gathered info is a text file, where
	each line has the following format: "<initial_url_i> >> <url_i1> <url_i2> ..
	<url_in> | <lang_i1> <lang_i2> .. <lang_im>"'''

# Names of the operations that can be selected with "--operation"
O_COMPL = 'complement_urls'
O_RESET = 'reset'
O_COLLECT = 'collect_new_info'
OPERATIONS = (O_COMPL, O_RESET, O_COLLECT)

# Names of the platforms that can be selected with "--platforms"; Each name
# is mapped to the value that Reset() and GetPendingUrls() compare with
# ParseURL(<url>).get(URL_SPEC)
P_GITHUB = 'github'
P_SOURCEFG = 'sourceforge'
P_PYPI = 'pypi'
PLATFORMS = dict((
	(P_GITHUB, FMT_GITHUB),
	(P_SOURCEFG, FMT_SOURCEFG),
	(P_PYPI, FMT_PYTHON),
	))

# Name of the external utility that is run by CollectInfo()
GET_PROJECTS_INFO = 'get-projects-info'

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Command line interface; The arguments are parsed right here, at module
# level, and the functions of this file read them from the global "args"
argparser = argparse.ArgumentParser(description=DESCRIPTION)
argparser.add_argument(
	'-o', '--operation',
	metavar='OPERATION_NAME', type=str, choices=OPERATIONS, required=True,
	help=f'The operation to be performed ({", ".join(OPERATIONS)})'
	)
argparser.add_argument(
	'-f', '--file_path',
	metavar='ABS_FILE_PATH', type=str, required=True,
	help='Absolute path of a file that contains gathered info'
	)
argparser.add_argument(
	'-p', '--platforms',
	metavar='PLATFORM_NAME', type=str, nargs='+', choices=sorted(PLATFORMS),
	default=[],
	help='Names of platforms which should be engaged in specified operation '
	f'({", ".join(sorted(PLATFORMS))})'
	)
argparser.add_argument(
	'--rewrite',
	action='store_true',
	help='Write complemented info back to the initial file'
	)
argparser.add_argument(
	'--local',
	action='store_true',
	help='Try to find things in the current dir, not in $PATH'
	)
args = argparser.parse_args()

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def GetCols(line):

	'''Split a line "<a> >> <b0> <b1> .. <bn> | <c0> <c1> .. <cm>" into
	a tuple ("<a>", "<b0> <b1> .. <bn>", "<c0> <c1> .. <cm>");
	An empty tuple is returned for a blank line, and None is returned for
	a line of a wrong format: there must be exactly one " >> " separator
	followed by at least one " | " separator (whatever comes after a second
	" | " is ignored)'''

	stripped = line.strip()
	if not stripped:
		return ()

	# Exactly one " >> " is allowed, so the tail must not contain another one
	head, sep, tail = stripped.partition(' >> ')
	if not sep or ' >> ' in tail:
		return None

	parts = tail.split(' | ')
	if len(parts) < 2:
		return None

	return head, parts[0], parts[1]

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def GenFilePath():

	'''Generate a path for a file that the results will be written to;
	It is the path of the initial file if "--rewrite" is given, otherwise it
	is the first path that does not exist yet among
	"<file_path>.<operation>", "<file_path>.<operation>1",
	"<file_path>.<operation>2" and so on'''

	if args.rewrite:
		return args.file_path

	base = f'{args.file_path}.{args.operation}'
	candidate = base
	attempt = 0

	# The very first candidate has no numeric suffix
	while os.path.exists(candidate):
		attempt += 1
		candidate = f'{base}{attempt}'

	return candidate

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ProcessValue(val):

	'''Strip a given value and return a dash instead of it if it is one of
	the special values that indicate the absence of a significant value:
	an exclamation mark, a pair of quotes or the word "unknown" in any case'''

	stripped = val.strip()

	if stripped in ('!', "''", '""'):
		return '-'

	if stripped.lower() == 'unknown':
		return '-'

	return stripped

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ProcessURLs(urls):

	'''Split a given string into whitespace separated URLs and return them as
	a list where each URL has lost one trailing slash (if there was any) and
	has got its "https" scheme replaced with "http"'''

	normalized = []

	for url in urls.split():
		without_slash = re.sub(r'\/$', '', url)
		normalized.append(re.sub(r'^https:\/\/', 'http://', without_slash))

	return normalized

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ProcessLangs(langs):

	'''Process a string that contains a listing of programming languages where
	each complex name of a language is represented as several words in brackets;
	The names are lower-cased, and the words of a complex name are joined by
	underscores to form a single element of the returned list;
	An example of how the function works:
	input:  "C++ C GLSL (OpenGL Shading Language) Python"
	output: ["c++", "c", "glsl", "opengl_shading_language", "python"]'''

	def JoinWords(m):
		# The surrounding spaces keep the joined name from being glued to
		# the neighbouring names once the brackets are gone
		return f' {"_".join(m.group(1).split())} '

	# Only the innermost brackets are matched, since the pattern does not
	# allow a bracket inside of a group
	return re.sub(r'\(([^\(\)]+)\)', JoinWords, langs.lower()).split()

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def WriteToFile(txt, file_path):

	'''Write specified txt string to specified file (the previous content of
	the file is replaced); If the file can't be written, an error message is
	printed and the program is terminated with exit status 5'''

	try:
		with open(file_path, 'w') as f:
			f.write(txt)
	except OSError:
		print(f'[ERROR: Can\'t write to {file_path}]')
		# The status has to fit into 8 bits: the value 256 that was used here
		# before is truncated to 0 on POSIX systems, i.e. reported as success
		exit(5)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ReadGatheredInfo():

	'''Read gathered info from the file specified by "--file_path";
	Returns a tuple where the first element is a dict
	{<url>: ([<urls>], [<langs>])} and the second element is a list of
	warnings, one for each line of a wrong format (blank lines are skipped
	silently); If the file does not exist, is not a regular file or can't be
	read, an error message is printed and the program is terminated with exit
	status 1'''

	if not os.path.exists(args.file_path):
		print(f'[ERROR: File {args.file_path} does not exist]')
		exit(1)

	if not os.path.isfile(args.file_path):
		print(f'[ERROR: File {args.file_path} is not a regular file]')
		exit(1)

	gathered = {}
	problems = []

	try:
		with open(args.file_path, 'r') as f:
			for line_num, line in enumerate(f, start=1):
				cols = GetCols(line)
				if cols is None:
					problems.append(f'Wrong format in line {line_num}')
				elif cols:
					init_url, urls, langs = cols
					# A later record of the same URL replaces an earlier one
					gathered[init_url] = (urls.split(), langs.split())
	except OSError:
		print(f'[ERROR: Can\'t read {args.file_path}]')
		exit(1)

	return gathered, problems

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def UpdateGatheredInfo(update):

	'''Combine the initial file with collected data
	{<init_url>: (<urls>, <langs>)} and write the result to a new file or back
	to the initial one (see GenFilePath); A record whose initial URL has a
	non-empty entry in the given dict is regenerated from that entry, other
	records are copied as they are, and lines that GetCols() can't split
	(blank or of a wrong format) are left out; Returns the path of the written
	file; If the initial file can't be read, an error message is printed and
	the program is terminated with exit status 2'''

	new_file_path = GenFilePath()
	out_lines = []

	try:
		with open(args.file_path, 'r') as f:
			for line in f:
				cols = GetCols(line)
				if not cols:
					continue
				new_info = update.get(cols[0])
				if new_info:
					urls = " ".join(new_info[0])
					langs = " ".join(new_info[1])
					out_lines.append(f'{cols[0]} >> {urls} | {langs}\n')
				else:
					out_lines.append(line)
	except OSError:
		print(f'[ERROR: Can\'t read from {args.file_path}]')
		exit(2)

	WriteToFile(''.join(out_lines), new_file_path)

	return new_file_path

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def Reset(platforms):

	'''Reset the records of specified platforms, i.e. replace the URLs and
	the languages of such a record with question marks, and write the result
	to a new file or back to the initial one (see GenFilePath); Records of
	other platforms are copied as they are, and lines that GetCols() can't
	split (blank or of a wrong format) are left out; Returns the path of the
	written file; If the initial file can't be read, an error message is
	printed and the program is terminated with exit status 3'''

	new_file_path = GenFilePath()
	out_lines = []

	try:
		with open(args.file_path, 'r') as f:
			for line in f:
				cols = GetCols(line)
				if not cols:
					continue
				platform_id = ParseURL(cols[0]).get(URL_SPEC)
				is_selected = any(
					platform_id == PLATFORMS.get(platform_name)
					for platform_name in platforms)
				if is_selected:
					# NOTE(review): there are two spaces after ">>" while
					# other records are written with one; the readers of
					# this file split the column by whitespace, so it is
					# kept as it is - confirm whether it is intentional
					out_lines.append(f'{cols[0]} >>  ? | ?\n')
				else:
					out_lines.append(line)
	except OSError:
		print(f'[ERROR: Can\'t read from {args.file_path}]')
		exit(3)

	WriteToFile(''.join(out_lines), new_file_path)

	return new_file_path

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ComplementGatheredInfo(gathered_info):

	'''Find one-way connections between records and produce the data that
	makes them mutual: if a record of <url_a> lists <url_b>, and there is
	a record of <url_b> that doesn't list <url_a> (URLs are compared
	case-insensitively), then the returned dict gets an entry
	{<url_b>: ([<urls>], [<langs>])} where <urls> is a sorted union of <url_a>,
	the URLs already collected for <url_b> by this function and the URLs of
	the <url_b> record (dashes excluded), and <langs> is a sorted list of
	the languages of the <url_b> record'''

	complement = {}

	for init_url, info in gathered_info.items():
		linked_urls, _ = info
		init_url_lc = init_url.lower()
		for linked_url in linked_urls:
			# A dash stands for "no URLs", and a self reference needs no fix
			if linked_url == '-' or linked_url.lower() == init_url_lc:
				continue
			# The lookup is case-sensitive, unlike the comparisons
			reverse = gathered_info.get(linked_url)
			if not reverse:
				continue
			back_urls, back_langs = reverse
			if any(url.lower() == init_url_lc for url in back_urls):
				continue
			merged = set(complement.get(linked_url, ([], []))[0])
			merged.add(init_url)
			merged.update(url for url in back_urls if url != '-')
			complement[linked_url] = (sorted(merged), sorted(back_langs))

	return complement

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def GetPendingUrls(gathered_info, platforms):

	'''Get a set of initial URLs of specified platforms where some of the data
	is missing, i.e. where the elements of the URLs column or of the languages
	column concatenate to exactly "?" or "!"'''

	pending = set()

	for init_url, info in gathered_info.items():
		has_missing = any(''.join(col) in ('?', '!') for col in info)
		if not has_missing:
			continue
		platform_id = ParseURL(init_url).get(URL_SPEC)
		for platform_name in platforms:
			if platform_id == PLATFORMS.get(platform_name):
				pending.add(init_url)
				break

	return pending

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def CollectInfo(pending_url):

	'''Run the "get-projects-info" utility for specified URLs and handle the
	results; Returns a dict {<init_url>: ([<urls>], [<langs>])} that is built
	from those lines of the utility's output that GetCols() can split, the
	URLs and the languages are normalized (see ProcessValue, ProcessURLs and
	ProcessLangs) and sorted; If the utility can't be started or returns
	a non-zero code, an error message is printed and the program is terminated
	with exit status 4'''

	# The URLs are sorted to get the same command line for the same set
	cmd = [f'{"./" if args.local else ""}{GET_PROJECTS_INFO}', '-u'] + \
		sorted(pending_url) + ['-l', '5']

	try:
		completed_process = subprocess.run(cmd, capture_output=True)
	except OSError as e:
		# E.g. the utility is not installed or is not executable
		print(f'[ERROR: command "{" ".join(cmd)}" can\'t be run, {e}]')
		exit(4)

	if completed_process.returncode != 0:
		print(f'[ERROR: command "{" ".join(cmd)}" has failed, '
			f'error code is {completed_process.returncode}]')
		# The captured error output is the only hint at what went wrong
		err = completed_process.stderr.decode('utf-8', errors='replace')
		if err.strip():
			print(err.strip())
		exit(4)

	res = {}

	for line in completed_process.stdout.splitlines():
		cols = GetCols(line.decode('utf-8'))
		if not cols:
			continue
		urls = ProcessURLs(ProcessValue(cols[1]))
		langs = ProcessLangs(ProcessValue(cols[2]))
		res[cols[0]] = (sorted(urls), sorted(langs))

	return res

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

if __name__ == '__main__':

	# Perform the operation selected with "--operation" and report where
	# the result has been saved

	if args.operation == O_RESET:
		if not args.platforms:
			print('[ERROR: Platform names are not specified]')
			exit(3)
		new_file_path = Reset(args.platforms)
	elif args.operation in (O_COMPL, O_COLLECT):
		# Both operations produce an update for the records that are read
		# from the initial file
		gathered_info, warnings = ReadGatheredInfo()
		for w in warnings:
			print(f'[WARNING: {w}]')
		if args.operation == O_COLLECT:
			pending_url = GetPendingUrls(gathered_info, args.platforms)
			if not pending_url:
				print('There are no pending URLs, nothing to do')
				exit(0)
			update = CollectInfo(pending_url)
		else:
			update = ComplementGatheredInfo(gathered_info)
		new_file_path = UpdateGatheredInfo(update)

	print(f'Result is saved to "{new_file_path}"')

	exit(0)
