path: root/redditorsbtfo.sh
author    sinanmohd <sinan@firemail.cc>  2023-05-05 10:18:35 +0530
committer sinanmohd <sinan@firemail.cc>  2023-05-05 21:42:14 +0530
commit    64e317946cdb57ed1514e4c65bbe6ef2c195413e (patch)
tree      ff7745ca5fc184e2d0f0e8d3702ef5d58e7b4d6f /redditorsbtfo.sh
repo: initial commit
Diffstat (limited to 'redditorsbtfo.sh')
-rwxr-xr-x  redditorsbtfo.sh  255
1 file changed, 255 insertions, 0 deletions
diff --git a/redditorsbtfo.sh b/redditorsbtfo.sh
new file mode 100755
index 0000000..be5ecd8
--- /dev/null
+++ b/redditorsbtfo.sh
@@ -0,0 +1,255 @@
+#!/bin/bash
+
+data_dir="data"
+
+usage()
+{
+ cat <<- EOF
+ Usage: ${0##*/} [options]
+ Scrape Reddit to block new NSFW domains
+
+ Options:
+ -o outfile (default: hosts)
+ -p number of pages to scrape (default: 10)
+ -d data directory (default: ./data)
+ -l subreddit list to scrape (default: v2)
+ v1 (1k subreddits)
+ v2 (8k subreddits)
+ bak (use the list in ${data_dir}/subs.bak.gz)
+
+ EOF
+}
+
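+# Illustrative invocations (flags as documented in usage() above; file and
+# directory names are placeholders):
+#   ./redditorsbtfo.sh                       # scrape the v2 list, write ./hosts
+#   ./redditorsbtfo.sh -p 5 -o nsfw.hosts -l v1
+#   ./redditorsbtfo.sh -l bak -d ./data      # reuse the list in ./data/subs.bak.gz
+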
+note()
+{
+ # usage: note color string1 [string2] ...
+ : "${2:?}"
+ color="${1:?}"
+ shift
+
+ case "$color" in
+ "green")
+ printf "\033[32;1m%b\033[0m" "$*" 1>&2
+ ;;
+ "yellow")
+ printf "\033[33;1m%b\033[0m" "$*" >&2
+ ;;
+ "red")
+ printf "\033[31;1m%b\033[0m" "$*" 1>&2
+ ;;
+ *)
+ printf "%b" "$*" 1>&2
+ ;;
+ esac
+}
+
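+# e.g. note "red" "err: something went wrong\n" prints a bold red message to
+# stderr, keeping stdout free for the scraped domain list (message text is
+# illustrative).
+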
+dep_check()
+{
+ # usage: dep_check cmd_1 ...
+ : "${1:?}"
+
+ for dep; do
+ if ! command -v "$dep" 1>/dev/null; then
+ note "red" "err: $dep not found, please install it\n"
+ exit 127
+ fi
+ done
+
+ unset dep
+}
+
+get_curl()
+{
+ curl "$@" \
+ --silent \
+ --compressed \
+ --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0'
+}
+
+get_reddit()
+{
+ # 'Cookie:_options={"pref_gated_sr_optin": true, "pref_quarantine_optin": true}; over18=1'
+ get_curl "$1" \
+ --header 'Cookie: _options=%7B%22pref_gated_sr_optin%22%3A%20true%2C%20%22pref_quarantine_optin%22%3A%20true%7D; over18=1' \
+ --write-out "%{http_code}"
+}
+
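+# get_reddit appends the HTTP status code to the body via --write-out, so a
+# caller can match on a trailing "...200"/"...404"/"...429", as the case
+# statement in soutlinks below does. Illustrative shape (URL is a placeholder):
+#   data="$(get_reddit "https://old.reddit.com/r/example/")"   # "<html>...</html>200"
+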
+get_subs_v1()
+{
+ note "white" "scrapping nsfw subreddit list\n"
+ for page in \# $(seq 2 5); do
+ get_reddit "https://redditlist.com/nsfw${page}" |
+ grep -o 'reddit.com/r/[^"]*' |
+ grep -v '^reddit.com/r/Redditlist/$'
+ done | sort -u
+}
+
+get_subs_v2()
+{
+ data=
+ page=1
+
+ note "white" "scrapping nsfw subreddit list (v2)\n"
+ while true; do
+ data="$(get_curl "https://be.mysubs.cc/api/subs/?page=${page}")"
+
+ for sub in $(echo "$data" | jq -r '.results[].name'); do
+ printf "reddit.com/r/%s\n" "$sub"
+ done
+
+ if [ -n "$data" ]; then
+ page=$((page + 1))
+
+ [ "$(echo "$data" | jq -r '.next')" == "null" ] &&
+ break;
+ fi
+
+ done
+}
+
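+# The mysubs.cc endpoint above is assumed to return paginated JSON roughly like
+#   {"next": "https://be.mysubs.cc/api/subs/?page=2", "results": [{"name": "..."}, ...]}
+# with "next" set to null on the last page, which is what the jq filters rely on.
+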
+get_subs_bak()
+{
+ zcat "${data_dir}/subs.bak.gz"
+}
+
+soutlinks()
+{
+ # usage: soutlinks "reddit.com/r/subreddit" pages
+ : "${1:?}"
+ pages="${2:?}"
+ page=0
+
+ note "white" "${1#*/} "
+ while [ "$page" -lt "$pages" ] ; do
+ data="$(get_reddit "https://old.${1}/?count=$((page * 25))")" &&
+ page=$((page + 1))
+
+ if [ -n "$data" ]; then
+ case "$data" in
+ *"404")
+ note "yellow" " banned subreddit"
+ break
+ ;;
+ *"451")
+ note "yellow" " banned in your country"
+ break
+ ;;
+ *"403")
+ note "red" " private subreddit"
+ break
+ ;;
+ *"429")
+ note "red" "..too many requests"
+ page=$((page - 1))
+ continue
+ ;;
+ *"200")
+ ;;
+ *)
+ note "red" "..something went wrong"
+ page=$((page - 1))
+ continue
+ ;;
+ esac
+ else
+ note "red" "..empty"
+ continue
+ fi
+
+ echo "$data" |
+ grep -Eo 'class="title may-blank outbound" [^ ]* href="https?://[^"]*' |
+ sed -E -e 's/.*href="https?:\/\///g' -e 's/\/.*//g'
+ note "green" "..${page}"
+ done
+ note "white" "\n"
+}
+
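+# The grep/sed pipeline above assumes old-reddit link posts are marked up
+# roughly like (illustrative, the attribute between class and href varies):
+#   <a class="title may-blank outbound" rel="nofollow" href="https://example.com/clip">
+# and reduces every match to the bare domain ("example.com").
+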
+make_hosts()
+{
+ # usage: make_hosts outfile
+ : "${1:?}"
+
+ while read -r domain; do
+ printf "0.0.0.0 %s\n" "$domain"
+ done | tee "$1"
+}
+
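+# Every domain read on stdin becomes a hosts(5) sinkhole entry, e.g. (illustrative):
+#   0.0.0.0 example.com
+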
+clean()
+{
+ # usage: clean outfile domain_list
+ : "${1:?}"
+ : "${2:?}"
+
+ note "white" "removing duplicates and known good domains\n"
+
+ curl https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts \
+ -o "${data_dir}/pornography-hosts"
+
+ # keep only domains not already covered by the upstream list or the local whitelist
+ while read -r domain; do
+ ! grep -Fi "$domain" "${data_dir}/pornography-hosts" "${data_dir}/whitelist" > /dev/null &&
+ echo "$domain"
+ done < "$2" | make_hosts "$1"
+}
+
+
+main()
+{
+ out="hosts"
+ pages=10
+ list_fun="v2"
+ temp_fl="$(mktemp "${TMPDIR:-/tmp}/redditorsbtfo.XXX")"
+
+ dep_check curl jq grep sed
+
+ while getopts "ho:p:l:d:" f; do
+ case "$f" in
+ o)
+ out="$OPTARG"
+ ;;
+ p)
+ pages="$OPTARG"
+ ;;
+ l)
+ case "$OPTARG" in
+ "v1")
+ list_fun="v1"
+ ;;
+ "v2")
+ list_fun="v2"
+ ;;
+ "bak")
+ list_fun="bak"
+ ;;
+ *)
+ usage
+ exit 1
+ ;;
+ esac
+ ;;
+ d)
+ data_dir="$OPTARG"
+
+ mkdir -p "$OPTARG" > /dev/null 2>&1 ||
+ exit 1
+ ;;
+ h)
+ usage
+ exit
+ ;;
+ ?)
+ usage
+ exit 1
+ ;;
+ esac
+ done
+
+ for sub in $(get_subs_${list_fun}); do
+ soutlinks "$sub" "$pages"
+ done | sort -u > "$temp_fl"
+
+ clean "$out" "$temp_fl"
+ rm "$temp_fl"
+}
+
+main "$@"