diff options
author | sinanmohd <sinan@firemail.cc> | 2023-05-05 10:18:35 +0530 |
---|---|---|
committer | sinanmohd <sinan@firemail.cc> | 2023-05-05 21:42:14 +0530 |
commit | 64e317946cdb57ed1514e4c65bbe6ef2c195413e (patch) | |
tree | ff7745ca5fc184e2d0f0e8d3702ef5d58e7b4d6f /redditorsbtfo.sh |
repo: initial commit
Diffstat (limited to 'redditorsbtfo.sh')
-rwxr-xr-x | redditorsbtfo.sh | 255 |
1 file changed, 255 insertions, 0 deletions
#!/bin/bash
# redditorsbtfo.sh — scrape outbound links posted on nsfw subreddits and
# build a hosts-format blocklist (0.0.0.0 <domain>) of the domains found,
# minus domains already covered by a known blocklist or the local whitelist.

data_dir="data"

usage()
{
	# print CLI help to stdout
	cat <<- EOF
	Usage: ${0##*/} [options]
	Scrape reddit to block new nsfw domains

	Options:
	-o	outfile, (default hosts)
	-p	number of pages to scrape, (default 10)
	-d	data directory, (default ./data)
	-l	subreddit list to scrape, (default v2)
		v1 (1k subreddits)
		v2 (8k subreddits)
		bak (use the list in ${data_dir}/subs.bak.gz)

	EOF
}

note()
{
	# usage: note [color] [string1] [string2] ...
	# print the strings to stderr, wrapped in an ANSI color escape;
	# unknown colors (e.g. "white") print uncolored
	: "${2:?}"
	color="${1:?}"
	shift

	case "$color" in
	"green")
		printf "\033[32;1m%b\033[0m" "$*" >&2
		;;
	"yellow")
		printf "\033[33;1m%b\033[0m" "$*" >&2
		;;
	"red")
		printf "\033[31;1m%b\033[0m" "$*" >&2
		;;
	*)
		printf "%b" "$*" >&2
		;;
	esac
}

dep_check()
{
	# usage: dep_check cmd_1 ...
	# exit 127 (command-not-found convention) if any dependency is missing
	: "${1:?}"

	for dep; do
		if ! command -v "$dep" > /dev/null; then
			note "red" "err: $dep not found, please install it\n"
			exit 127
		fi
	done

	unset dep
}

get_curl()
{
	# curl wrapper: quiet, accepts compressed bodies, browser user-agent
	# (reddit serves different markup to unknown agents)
	curl "$@" \
		--silent \
		--compressed \
		--header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0'
}

get_reddit()
{
	# fetch a reddit url with the opt-in cookies set so gated/quarantined
	# and over-18 subreddits are served; the http status code is appended
	# to the body so callers can pattern-match on it.
	# decoded cookie:
	# _options={"pref_gated_sr_optin": true, "pref_quarantine_optin": true}; over18=1
	get_curl "$1" \
		--header 'Cookie: _options=%7B%22pref_gated_sr_optin%22%3A%20true%2C%20%22pref_quarantine_optin%22%3A%20true%7D; over18=1' \
		--write-out "%{http_code}"
}

get_subs_v1()
{
	# scrape redditlist.com (pages 1-5) for nsfw subreddit urls.
	# first iteration uses a literal '#' so the url becomes "nsfw#",
	# which curl treats as "nsfw" plus a fragment, i.e. page 1.
	note "white" "scraping nsfw subreddit list\n"
	for page in \# $(seq 2 5); do
		get_reddit "https://redditlist.com/nsfw${page}" |
			grep -o 'reddit.com/r/[^"]*' |
			grep -v '^reddit.com/r/Redditlist/$'
	done | sort -u
}

get_subs_v2()
{
	# page through the mysubs.cc json api until .next is null
	data=
	page=1

	note "white" "scraping nsfw subreddit list (v2)\n"
	while true; do
		data="$(get_curl "https://be.mysubs.cc/api/subs/?page=${page}")"

		# an empty response must end the loop: retrying the same
		# page forever would hang the script
		if [ -z "$data" ]; then
			note "red" "err: empty response from the subs api\n"
			break
		fi

		for sub in $(echo "$data" | jq -r '.results[].name'); do
			printf "reddit.com/r/%s\n" "$sub"
		done

		[ "$(echo "$data" | jq -r '.next')" = "null" ] &&
			break
		page=$((page + 1))
	done
}

get_subs_bak()
{
	# use a previously saved subreddit list instead of scraping one
	zcat "${data_dir}/subs.bak.gz"
}

soutlinks()
{
	# usage: soutlinks "reddit.com/r/subreddit" pages
	# print the domains of outbound links on the first [pages] listing
	# pages of a subreddit; progress/diagnostics go to stderr via note
	: "${1:?}"
	pages="${2:?}"
	page=0

	note "white" "${1#*/} "
	while [ "$page" -lt "$pages" ]; do
		# get_reddit appends the http status code to the body;
		# only count the page once it was actually fetched
		data="$(get_reddit "https://old.${1}/?count=$((page * 25))")" &&
			page=$((page + 1))

		if [ -n "$data" ]; then
			case "$data" in
			*"404")
				note "yellow" " banned subreddit"
				break
				;;
			*"451")
				note "yellow" " banned in your country"
				break
				;;
			*"403")
				note "red" " private subreddit"
				break
				;;
			*"429")
				# rate limited: undo the increment and retry
				note "red" "..too many requests"
				page=$((page - 1))
				continue
				;;
			*"200")
				;;
			*)
				note "red" "..something went wrong"
				page=$((page - 1))
				continue
				;;
			esac
		else
			# NOTE(review): a persistently failing fetch retries
			# this page indefinitely — consider a retry cap
			note "red" "..empty"
			continue
		fi

		echo "$data" |
			grep -Eo 'class="title may-blank outbound" [^ ]* href="https?://[^"]*' |
			sed -E -e 's/.*href="https?:\/\///g' -e 's/\/.*//g'
		note "green" "..${page}"
	done
	note "white" "\n"
}

make_hosts()
{
	# usage: make_hosts outfile
	# turn a domain list on stdin into hosts(5) lines, written to both
	# outfile and stdout
	: "${1:?}"

	while read -r domain; do
		printf "0.0.0.0 %s\n" "$domain"
	done | tee "$1"
}

clean()
{
	# usage: clean outfile domain_list
	# drop domains already present in a well-known porn blocklist or in
	# the local whitelist, then emit the remainder as a hosts file
	: "${1:?}"
	: "${2:?}"

	note "white" "removing duplicates and known good domains\n"

	get_curl "https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts" \
		-o "${data_dir}/pornography-hosts"

	while read -r domain; do
		# -F: match the domain literally — dots in a domain are not
		# regex wildcards; 2>/dev/null tolerates a missing whitelist
		! grep -Fi "$domain" "${data_dir}/pornography-hosts" \
			"${data_dir}/whitelist" > /dev/null 2>&1 &&
			echo "$domain"
	done < "$2" | make_hosts "$1"
}


main()
{
	out="hosts"
	pages=10
	list_fun="v2"
	temp_fl="$(mktemp "${TMPDIR:-/tmp}/redditorsbtfo.XXX")"
	# remove the temp file on any exit path, not just the happy one
	trap 'rm -f "$temp_fl"' EXIT

	dep_check curl jq grep sed

	while getopts "ho:p:l:d:" f; do
		case "$f" in
		o)
			out="$OPTARG"
			;;
		p)
			pages="$OPTARG"
			;;
		l)
			case "$OPTARG" in
			"v1")
				list_fun="v1"
				;;
			"v2")
				list_fun="v2"
				;;
			"bak")
				list_fun="bak"
				;;
			*)
				usage
				exit 1
				;;
			esac
			;;
		d)
			data_dir="$OPTARG"

			mkdir -p "$OPTARG" > /dev/null 2>&1 ||
				exit 1
			;;
		h)
			usage
			exit
			;;
		?)
			usage
			exit 1
			;;
		esac
	done

	# dispatch to get_subs_v1 / get_subs_v2 / get_subs_bak
	for sub in $("get_subs_${list_fun}"); do
		soutlinks "$sub" "$pages"
	done | sort -u > "$temp_fl"

	clean "$out" "$temp_fl"
}

main "$@"