#!/usr/bin/python3 # Open a pull req if any sources are missing/need to be added from bs4 import BeautifulSoup import requests import time import re MICROSOFT_URL="https://support.microsoft.com/en-us/windows/common-file-name-extensions-in-windows-da4a4430-8e76-89c5-59f7-1cdbbc75cb01" WIKI_ROOT="https://en.wikipedia.org" WIKI_URL="https://en.wikipedia.org/wiki/List_of_filename_extensions" GFG_URL="https://www.geeksforgeeks.org/list-of-file-formats/" wiki_urls=[] exts=[] tables=[] res=requests.get(WIKI_URL).text wiki_soup=BeautifulSoup(res,"html.parser") for i in wiki_soup.findAll('div',{"class":"mw-content-ltr mw-parser-output"})[0].findAll("ul")[2].findAll("li"): wiki_urls.append(WIKI_ROOT+i.a.get('href')) for i in wiki_urls: res=requests.get(i).text wiki_soup=BeautifulSoup(res,"html.parser") tables+=wiki_soup.findAll("table") time.sleep(0.5) for i in tables: if "wikitable" not in i["class"]: continue for ext in i.tbody.findAll("tr"): ext=ext.findAll('td') if ext==[]: continue ext=re.sub(r"\[.*?\]","",ext[0].text).strip() if "," in ext: for j in ext.split(","): if "." in j: continue exts.append(j.strip()) continue if "." in ext: continue exts.append(ext) res=requests.get(MICROSOFT_URL).text microsoft_soup=BeautifulSoup(res,"html.parser") microsoft_exts=microsoft_soup.findAll("tbody")[1].findAll('p')[::2] for i in microsoft_exts: i=i.text if "," in i: i=i.split(",") for j in i: exts.append(j.strip()) continue exts.append(i) res=requests.get(GFG_URL).text gfg_soup=BeautifulSoup(res,"html.parser") gfg_exts=gfg_soup.findAll("tbody") for i in gfg_exts: i=i.findAll('th') for ext in i: ext=ext.text.strip() if ext.startswith('.'): exts.append(ext[1:].upper()) else: exts.append(ext.upper()) cleaned_exts=[] for i in exts: # https://stackoverflow.com/questions/3627784/case-insensitive-in if i.upper() in (cleaned_ext.upper() for cleaned_ext in cleaned_exts): continue cleaned_exts.append(i) exts=cleaned_exts exts=list(dict.fromkeys(exts)) exts.sort() open("../Fuzzing/file-extensions.txt","w").write("\n".join(exts)) mutated_exts=[] for i in exts: mutated_exts.append(i) mutated_exts.append(i.upper()) mutated_exts.append(i.lower()) mutated_exts=list(dict.fromkeys(mutated_exts)) mutated_exts.sort() open("../Fuzzing/file-extensions-all-cases.txt","w").write("\n".join(mutated_exts)) mutated_exts=[] for i in exts: mutated_exts.append(i.lower()) mutated_exts=list(dict.fromkeys(mutated_exts)) mutated_exts.sort() open("../Fuzzing/file-extensions-lower-case.txt","w").write("\n".join(mutated_exts)) mutated_exts=[] for i in exts: mutated_exts.append(i.upper()) mutated_exts=list(dict.fromkeys(mutated_exts)) mutated_exts.sort() open("../Fuzzing/file-extensions-upper-case.txt","w").write("\n".join(mutated_exts))