Merge pull request #1005 from molangning/patch-remote-wordlist-updater-rebase

Added a remote wordlist updater (rebase)
g0tmi1k committed 2024-06-11 16:40:02 +01:00 (committed by GitHub)
commit bcc0c2f093
9 changed files with 102166 additions and 1475 deletions


@@ -0,0 +1,92 @@
#!/usr/bin/env python3
import os
import sys
import json

# Exit quietly when no file list was passed in.
if len(sys.argv) < 2 or not sys.argv[1]:
    exit(0)

# When run under the caller workflow script, emit machine-readable
# "E,<file>,<line>" / "W,<file>,<line>" records instead of human-readable output.
IS_WRAPPED = False
if "IS_RUNNING_UNDER_CALLER_SCRIPT" in os.environ:
    IS_WRAPPED = os.environ["IS_RUNNING_UNDER_CALLER_SCRIPT"] == "1"

def print_normal(msg):
    if IS_WRAPPED:
        return
    print(msg)

def print_err(file, line_number):
    if IS_WRAPPED:
        print("E,%s,%s" % (file, line_number))

def print_warn(file, line_number):
    if IS_WRAPPED:
        print("W,%s,%s" % (file, line_number))

print_normal("[+] Remote wordlist overwrite check")

if IS_WRAPPED:
    print("Remote wordlist overwrite check")
    print("Files that the script catches will be overwritten next update.")

files = sys.argv[1].split(" ")

for i in files:
    if not os.path.isfile(i):
        print_err(i, 0)
        print_normal("[!] %s does not exist!" % (i))
        exit(2)

overall_pass_status = True
sources = json.load(open(".bin/wordlist-updaters/sources.json"))

# Collect every path a configured source may write to.
overwritten_paths = {
    "dirs": [],
    "files": []
}

for source in sources:
    found_paths = []
    if "output" in source.keys():
        found_paths.append(source["output"])
    if "additional_paths" in source.keys():
        found_paths += source["additional_paths"]
    for path in found_paths:
        if os.path.isdir(path):
            overwritten_paths["dirs"].append(path)
        elif os.path.isfile(path):
            overwritten_paths["files"].append(path)

# Flag any changed file that lives inside an auto-updated path.
for i in files:
    for dir_path in overwritten_paths["dirs"]:
        if i.startswith(dir_path):
            print_normal(f"[!] Warning: file {i} is in a directory that will get overwritten!")
            print_err(i, 0)
            overall_pass_status = False
            break
    for file_path in overwritten_paths["files"]:
        if i == file_path:
            print_normal(f"[!] Warning: file {i} will get overwritten!")
            print_err(i, 0)
            overall_pass_status = False
            break

if overall_pass_status:
    print_normal("[+] All files passed overwrite checks")
    exit(0)

print_normal("[!] Warning: One or more files failed to pass the overwrite checks")
if IS_WRAPPED:
    exit(0)
else:
    exit(2)

.bin/trickest-patcher.py

@@ -41,6 +41,7 @@ for i in os.listdir(INPUT_ROBOTS):
    shutil.copytree(path,OUTPUT_ROBOTS,dirs_exist_ok=True)
print("[+] Copied all the files")
for i in [OUTPUT_ROBOTS,OUTPUT_TECHNOLOGIES]:
    for root,_,file_list in os.walk(i):
        for file in file_list:
@@ -64,6 +65,3 @@ for i in [OUTPUT_ROBOTS,OUTPUT_TECHNOLOGIES]:
            if len(contents)!=len(patch_content):
                open(path,"wb").write(b"\n".join(patch_content))

.bin/trickest-updater.sh (deleted file)

@ -1,12 +0,0 @@
#!/usr/bin/bash
set -e
set -o pipefail
mkdir -p .working_space
cd .working_space
git clone --depth=1 https://github.com/trickest/wordlists.git
cd ../
./.bin/trickest-patcher.py
rm -rf .working_space


@@ -0,0 +1,56 @@
# Wordlist updaters
## Overview
These scripts update wordlists from the remote sources defined in `sources.json`.
A GitHub Action checks every hour whether the update conditions are met, then updates accordingly.
`status.json` is not meant to be edited in a PR.
## Format
Example `sources.json`:
```json
[
    {
        "name": "Jwt secrets update",
        "type": "file",
        "source": "https://raw.githubusercontent.com/wallarm/jwt-secrets/master/jwt.secrets.list",
        "output": "Passwords/scraped-JWT-secrets.txt",
        "post_run_script": "",
        "frequency": "3h"
    }
]
```
All fields are required unless otherwise stated.

`name` is the name of the task.

`type` can be one of the following: `file`, `git_dir`.

`source` specifies the remote location. If the type is `git_dir`, the repository at that location will be cloned using git.

`frequency` is the update frequency. The script uses the `status.json` file to know when to update. Accepted units of time are `h`/`H` for hours and `d`/`D` for days. A frequency can be specified with only days, only hours, or both, but hours cannot come before days (`1d6h` is valid, `6h1d` is not).

`update_time` specifies the daily update time in UTC 24-hour syntax (e.g. `0300`). Only one scheduling field can be set at a time (`frequency` or `update_time`).

`output` is the file/directory the script will write the output to.

`post_run_script` is a script to run after the list has been pulled successfully. This field is optional.

`additional_paths` lists additional paths that the workflow check script should alert on if a pull request touches them. This field is optional and is not used by the updater itself, only by the checker.
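For illustration, here is a minimal sketch of a parser for this frequency grammar (a hypothetical `parse_frequency` helper; the actual validation lives in `updater.py` and uses its own regex):

```python
import re

# Illustrative regex: days (optional) must come before hours (optional).
FREQUENCY_RE = re.compile(r"^(?:(\d+)[dD])?(?:(\d+)[hH])?$")

def parse_frequency(value):
    """Return (days, hours) for strings like '3h', '2d' or '1d6h', else None."""
    match = FREQUENCY_RE.match(value)
    if not value or not match:
        return None
    days, hours = match.groups()
    return int(days or 0), int(hours or 0)

assert parse_frequency("1d6h") == (1, 6)
assert parse_frequency("6h1d") is None  # hours before days is rejected
```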
- - -
Example `status.json`:
```json
{
    "Jwt secrets update": {
        "last_update": 0
    }
}
```
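To illustrate how `last_update` drives scheduling, a minimal sketch with hypothetical values (the real queueing logic, including the 5-minute early-queue window, is in `updater.py`):

```python
from datetime import datetime, timedelta

last_update = 1712376971   # unix timestamp recorded in status.json
days, hours = 0, 6         # parsed from a "6h" frequency

# The task becomes due once last_update plus the frequency has elapsed.
next_update = datetime.fromtimestamp(last_update) + timedelta(days=days, hours=hours)
if datetime.now() >= next_update:
    print("task is due for an update")
```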

.bin/wordlist-updaters/sources.json

@ -0,0 +1,22 @@
[
{
"name": "Jwt secrets update",
"type": "file",
"source": "https://raw.githubusercontent.com/wallarm/jwt-secrets/master/jwt.secrets.list",
"output": "Passwords/scraped-JWT-secrets.txt",
"post_run_script": "",
"frequency": "6h"
},
{
"name": "Trickest wordlist update",
"type": "git_dir",
"source": "https://github.com/trickest/wordlists.git",
"output": ".working_space",
"post_run_script": ".bin/trickest-patcher.py",
"update_time": "1030",
"additional_paths": [
"Discovery/Web-Content/trickest-robots-disallowed-wordlists/",
"Discovery/Web-Content/CMS/trickest-cms-wordlist/"
]
}
]

.bin/wordlist-updaters/status.json

@@ -0,0 +1,8 @@
{
    "Jwt secrets update": {
        "last_update": 1712376971
    },
    "Trickest wordlist update": {
        "last_update": 1712310048
    }
}

.bin/wordlist-updaters/updater.py (new executable file, 178 lines)

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
import os
import re
import json
import requests
import subprocess
from datetime import datetime, timedelta

# TODO Summary file
# TODO Advanced crontab syntax

BASE_PATH = ".bin/wordlist-updaters"
SOURCE_PATH = os.path.join(BASE_PATH, "sources.json")
STATUS_PATH = os.path.join(BASE_PATH, "status.json")

# Days (optional) must come before hours (optional), e.g. "2d", "6h", "1d6h".
FREQUENCY_REGEX = r"^(?:([0-9]+)d|())(?:([0-9]+)h|())(?!.*?d)$"
VALID_TYPES = ["file", "git_dir"]

TIME_NOW = datetime.now()

def request_wrapper(url):
    # Retry up to three times before giving up.
    for i in range(1, 4):
        r = requests.get(url)
        if r.status_code == 200:
            # print("[+] Got %s successfully!"%(url))
            break
        if i == 3:
            print("[!] Failed to get %s." % (url))
            exit(2)
        print("[!] Getting %s failed (%i/3)" % (url, i))
    return r.text

# Check that the config files exist
if not os.path.isfile(SOURCE_PATH):
    print("[!] sources.json is missing!")
    exit(2)

if not os.path.isfile(STATUS_PATH):
    print("[!] status.json is missing!")
    exit(2)

SOURCES = json.load(open(SOURCE_PATH, "r"))
STATUS = json.load(open(STATUS_PATH, "r"))

to_check = []

for source in SOURCES:
    task_name = source["name"]
    source_keys = source.keys()

    if not task_name in STATUS.keys():
        print(f"[+] Queuing task {task_name} as task was never checked before")
        to_check.append(source)
        continue

    if not "output" in source_keys or not isinstance(source["output"], str):
        print(f"[!] Skipping task {task_name} as output field is missing/invalid")
        continue

    if not "type" in source_keys or not isinstance(source["type"], str):
        print(f"[!] Skipping task {task_name} as type field is missing/invalid")
        continue

    if not source["type"] in VALID_TYPES:
        print(f"[!] Skipping task {task_name} as type is invalid")
        continue

    if source["output"].startswith("/"):
        print(f"[!] Skipping task {task_name} as output path is not relative.")
        continue

    if source["type"].startswith("git_") and not source["source"].endswith(".git"):
        print(f"[!] Skipping task {task_name} as a git task was defined with a non-git url.")
        continue

    if not "last_update" in STATUS[task_name].keys() or not isinstance(STATUS[task_name]["last_update"], int):
        print(f"[!] Queuing task {task_name} as last_update field is missing/invalid")
        to_check.append(source)
        continue

    # Exactly one of "frequency" and "update_time" must be set.
    if not ("frequency" in source_keys) ^ ("update_time" in source_keys):
        print(f"[!] Skipping task {task_name} as only one of frequency or update_time can be specified")
        continue

    if "frequency" in source_keys and isinstance(source["frequency"], str):
        regex_match = re.search(FREQUENCY_REGEX, source["frequency"])
        if not regex_match:
            print(f"[!] Skipping task {task_name} as frequency field contains invalid formatting of days and hours")
            continue

        days, _, hours, _ = regex_match.groups()
        # Convert the captured digit strings to integers, defaulting to 0.
        days = int(days or 0)
        hours = int(hours or 0)

        next_update_time = datetime.fromtimestamp(STATUS[task_name]["last_update"]) + timedelta(days=days, hours=hours)
        time_from_update = TIME_NOW - next_update_time
        time_to_update = next_update_time - TIME_NOW

        if TIME_NOW < next_update_time:
            # Queue slightly early so an hourly runner does not miss the slot.
            if time_to_update.total_seconds() <= 300:
                print(f"[+] Queuing task {task_name} as it is less than 5 minutes to update. ({int(time_to_update.total_seconds())} seconds to update)")
                to_check.append(source)
                continue

            print(f"[!] Skipping task {task_name} as it is more than 5 minutes to update ({int(time_to_update.total_seconds())} seconds to update)")
            continue

        print(f"[+] Queuing task {task_name} as it is {int(time_from_update.total_seconds())} seconds after scheduled update time.")
        to_check.append(source)

    elif "update_time" in source_keys and isinstance(source["update_time"], str):
        update_time = source["update_time"]

        if len(update_time) != 4 or not update_time.isnumeric():
            print(f"[!] Skipping task {task_name} as update_time is in an incorrect format")
            continue

        hours = int(update_time[:2])
        minutes = int(update_time[2:])

        if not hours in range(0, 24):
            print(f"[!] Skipping task {task_name} as hours is not in range 0-23.")
            continue

        if not minutes in range(0, 60):
            print(f"[!] Skipping task {task_name} as minutes is not in range 0-59.")
            continue

        scheduled_update_time = TIME_NOW.replace(hour=hours, minute=minutes)

        # Queue the task if its daily slot falls within the next hour.
        if TIME_NOW <= scheduled_update_time and TIME_NOW + timedelta(hours=1) >= scheduled_update_time:
            print(f"[+] Queuing task {task_name} as update time is within the next hour")
            to_check.append(source)
            continue

    else:
        print(f"[!] Skipping task {task_name} as frequency/update_time field is invalid")
        continue

if len(to_check) == 0:
    print("[!] No tasks were queued. Exiting.")
    exit()

print(f"[+] Queued a total of {len(to_check)} tasks to run.")

for task in to_check:
    print(f"[+] Starting task {task['name']}")

    if not task["name"] in STATUS.keys():
        STATUS[task["name"]] = {}

    task_type = task["type"]

    if task_type == "file":
        content = request_wrapper(task["source"])
        open(task["output"], "w").write(content)
        print("[+] Saved file to output location")
        STATUS[task["name"]]["last_update"] = int(datetime.now().timestamp())

    elif task_type == "git_dir":
        if not os.path.exists(task['output']):
            print(f"[+] Making directory {task['output']}")
            os.makedirs(task["output"])

        subprocess.run(["git", "clone", "-q", "--depth=1", task["source"]], cwd=task["output"])
        STATUS[task["name"]]["last_update"] = int(datetime.now().timestamp())

    if task["post_run_script"]:
        print("[+] Running post run script")
        subprocess.run(task["post_run_script"])
        print("[+] Finished running post run script")

    print(f"[+] Finished task {task['name']}")

# Persist the new last_update timestamps.
json.dump(STATUS, open(STATUS_PATH, "w"), indent=4)


@@ -1,8 +1,8 @@
-name: Wordlist Updater - Trickest wordlists updater
+name: Wordlist Updater - Remote wordlists updater
 on:
   schedule:
-    - cron: 0 0 * * *
+    - cron: 0 * * * *
   workflow_dispatch:
@@ -13,7 +13,7 @@ jobs:
     - uses: actions/checkout@v3
     - name: Update lists
-      run: ./.bin/trickest-updater.sh
+      run: ./.bin/wordlist-updaters/updater.py
     - name: Commit files if changed
       run: |
@@ -23,7 +23,7 @@ jobs:
           echo "[+] No files were changed"
         else
           echo "[+] Files were changed! Pushing changed..."
-          git add -A
+          git add --renormalize -A && git add -A
           git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY
           git config --local user.email "example@github.com"
           git config --local user.name "GitHub Action"

File diff suppressed because it is too large