#!/bin/bash

data_dir="data"

usage() {
	cat <<- EOF
	Usage: ${0##*/} [options]
	Scrape reddit to block new nsfw domains

	Options:
	  -o  outfile, (default hosts)
	  -p  number of pages to scrape, (default 10)
	  -d  data directory, (default ./data)
	  -l  subreddit list to scrape, (default v2)
	        v1  (1k subreddits)
	        v2  (8k subreddits)
	        bak (use the list in ${data_dir}/subs.bak.gz)
	EOF
}

note() {
	# usage: note [color] [string1] [string2] ...
	: "${2:?}"
	color="${1:?}"
	shift
	case "$color" in
		"green") printf "\033[32;1m%b\033[0m" "$*" >&2 ;;
		"yellow") printf "\033[33;1m%b\033[0m" "$*" >&2 ;;
		"red") printf "\033[31;1m%b\033[0m" "$*" >&2 ;;
		*) printf "%b" "$*" >&2 ;;
	esac
}

dep_check() {
	# usage: dep_check cmd_1 ...
	: "${1:?}"
	for dep; do
		if ! command -v "$dep" 1>/dev/null; then
			note "red" "err: $dep not found, please install it\n"
			exit 127
		fi
	done
	unset dep
}

get_curl() {
	curl "$@" \
		--silent \
		--compressed \
		--header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0'
}

get_reddit() {
	# decoded cookie:
	# 'Cookie: _options={"pref_gated_sr_optin": true, "pref_quarantine_optin": true}; over18=1'
	get_curl "$1" \
		--header 'Cookie: _options=%7B%22pref_gated_sr_optin%22%3A%20true%2C%20%22pref_quarantine_optin%22%3A%20true%7D; over18=1' \
		--write-out "%{http_code}"
}

get_subs_v1() {
	note "white" "scraping nsfw subreddit list\n"
	# '#' resolves to the bare /nsfw page (the fragment is not sent); pages 2-5 follow
	for page in \# $(seq 2 5); do
		get_reddit "https://redditlist.com/nsfw${page}" |
			grep -o 'reddit.com/r/[^"]*' |
			grep -v '^reddit.com/r/Redditlist/$'
	done | sort -u
}

get_subs_v2() {
	data=
	page=1
	note "white" "scraping nsfw subreddit list (v2)\n"
	while true; do
		data="$(get_curl "https://be.mysubs.cc/api/subs/?page=${page}")"
		for sub in $(echo "$data" | jq -r '.results[].name'); do
			printf "reddit.com/r/%s\n" "$sub"
		done
		if [ -n "$data" ]; then
			page=$((page + 1))
			[ "$(echo "$data" | jq -r '.next')" = "null" ] && break
		fi
	done
}

get_subs_bak() {
	zcat "${data_dir}/subs.bak.gz"
}

soutlinks() {
	# usage: soutlinks "reddit.com/r/subreddit" [pages]
	: "${1:?}"
	pages="${2:?}"
	page=0
	note "white" "${1#*/} "
	while [ "$page" -lt "$pages" ]; do
		data="$(get_reddit "https://old.${1}/?count=$((page * 25))")" && page=$((page + 1))
		if [ -n "$data" ]; then
			# get_reddit appends the HTTP status code to the page body
			case "$data" in
				*"404") note "yellow" " banned subreddit"
					break ;;
				*"451") note "yellow" " banned in your country"
					break ;;
				*"403") note "red" " private subreddit"
					break ;;
				*"429") note "red" "..too many requests"
					page=$((page - 1))
					continue ;;
				*"200") ;;
				*) note "red" "..something went wrong"
					page=$((page - 1))
					continue ;;
			esac
		else
			note "red" "..empty"
			continue
		fi
		echo "$data" |
			grep -Eo 'class="title may-blank outbound" [^ ]* href="https?://[^"]*' |
			sed -E -e 's/.*href="https?:\/\///g' -e 's/\/.*//g'
		note "green" "..${page}"
	done
	note "white" "\n"
}

make_hosts() {
	# usage: make_hosts [outfile]
	: "${1:?}"
	while read -r domain; do
		printf "0.0.0.0 %s\n" "$domain"
	done | tee "$1"
}

clean() {
	# usage: clean [outfile] [domain_list]
	: "${1:?}"
	: "${2:?}"
	note "white" "removing duplicates and known good domains\n"
	curl https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts \
		-o "${data_dir}/pornography-hosts"
	while read -r domain; do
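		# keep a domain only if it appears in neither the upstream pornography-hosts list nor ${data_dir}/whitelist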
grep -i "$domain" "${data_dir}/pornography-hosts" "${data_dir}/whitelist" > /dev/null && echo "$domain" done < "$2" | make_hosts "$1" } main() { out="hosts" pages=10 list_fun="v2" temp_fl="$(mktemp "${TMPDIR:-/tmp}/redditorsbtfo.XXX")" dep_check curl jq grep sed while getopts "ho:p:l:d:" f; do case "$f" in o) out="$OPTARG" ;; p) pages="$OPTARG" ;; l) case "$OPTARG" in "v1") list_fun="v1" ;; "v2") list_fun="v2" ;; "bak") list_fun="bak" ;; *) usage exit 1 ;; esac ;; d) data_dir="$OPTARG" mkdir -p "$OPTARG" > /dev/null 2>&1 || exit 1 ;; h) usage exit ;; ?) usage exit 1 ;; esac done for sub in $(get_subs_${list_fun}); do soutlinks "$sub" "$pages" done | sort -u > "$temp_fl" clean "$out" "$temp_fl" rm "$temp_fl" } main "$@"