#!/bin/sh
data_dir="data"
usage()
{
    cat <<- EOF
Usage: ${0##*/} [options]
Scrape reddit to block new nsfw domains
Options:
-o outfile, (default hosts)
-p number of pages to scrape, (default 10)
-d data directory, (default ./data)
-l subreddit list to scrape, (default v2)
v1 (1k subreddits)
v2 (8k subreddits)
bak (use the list in ${data_dir}/subs.bak.gz)
EOF
}
note()
{
    # usage: note color string1 [string2] ...
    : "${2:?}"
    color="${1:?}"
    shift
    case "$color" in
        "green")
            printf "\033[32;1m%b\033[0m" "$*" 1>&2
            ;;
        "yellow")
            printf "\033[33;1m%b\033[0m" "$*" 1>&2
            ;;
        "red")
            printf "\033[31;1m%b\033[0m" "$*" 1>&2
            ;;
        *)
            printf "%b" "$*" 1>&2
            ;;
    esac
}
dep_check()
{
    # usage: dep_check cmd_1 [cmd_2] ...
    : "${1:?}"
    for dep; do
        if ! command -v "$dep" 1>/dev/null; then
            note "red" "err: $dep not found, please install it\n"
            exit 127
        fi
    done
    unset dep
}
get_curl()
{
curl "$@" \
--silent \
--compressed \
--header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0'
}
get_reddit()
{
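    # the cookie opts in to gated/quarantined subreddits and the over-18 wall;
    # --write-out appends the http status code to the body so callers can branch on it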
    # 'Cookie:_options={"pref_gated_sr_optin": true, "pref_quarantine_optin": true}; over18=1'
    get_curl "$1" \
        --header 'Cookie: _options=%7B%22pref_gated_sr_optin%22%3A%20true%2C%20%22pref_quarantine_optin%22%3A%20true%7D; over18=1' \
        --write-out "%{http_code}"
}
get_subs_v1()
{
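    # the escaped '#' makes curl treat '#.html' as a url fragment, so the first
    # request fetches /nsfw while pages 2-5 fetch /nsfw2.html .. /nsfw5.html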
note "white" "scrapping nsfw subreddit list\n"
for page in \# $(seq 2 5); do
get_reddit "https://redditlist.com/nsfw${page}.html" |
grep -o 'reddit.com/r/[^"]*' |
grep -v '^reddit.com/r/Redditlist/$'
done | sort -u
}
get_subs_v2()
{
    data=
    page=1
    note "white" "scraping nsfw subreddit list (v2)\n"
    while true; do
        data="$(get_curl "https://be.mysubs.cc/api/subs/?page=${page}")"
        # stop on an empty response instead of retrying the same page forever
        [ -n "$data" ] || break
        for sub in $(echo "$data" | jq -r '.results[].name'); do
            printf "reddit.com/r/%s\n" "$sub"
        done
        # the api reports .next as null on the last page
        [ "$(echo "$data" | jq -r '.next')" = "null" ] && break
        page=$((page + 1))
    done
}
get_subs_bak()
{
    zcat "${data_dir}/subs.bak.gz"
}
soutlinks()
{
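    # old.reddit paginates listings with ?count=N (25 posts per page); each page is
    # fetched, its trailing http status code is checked, and the hosts of outbound
    # links are printed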
    # usage: soutlinks reddit.com/r/subreddit pages
    : "${1:?}"
    pages="${2:?}"
    page=0
    note "white" "${1#*/} "
    while [ "$page" -lt "$pages" ]; do
        data="$(get_reddit "https://old.${1}/?count=$((page * 25))")" &&
            page=$((page + 1))
        if [ -n "$data" ]; then
            # get_reddit appends the http status code, so branch on the trailing digits
            case "$data" in
                *"404")
                    note "yellow" " banned subreddit"
                    break
                    ;;
                *"451")
                    note "yellow" " banned in your country"
                    break
                    ;;
                *"403")
                    note "red" " private subreddit"
                    break
                    ;;
                *"429")
                    note "red" "..too many requests"
                    page=$((page - 1))
                    continue
                    ;;
                *"200")
                    ;;
                *)
                    note "red" "..something went wrong"
                    page=$((page - 1))
                    continue
                    ;;
            esac
        else
            note "red" "..empty"
            continue
        fi
        echo "$data" |
            grep -Eo 'class="title may-blank outbound" [^ ]* href="https?://[^"]*' |
            sed -E -e 's/.*href="https?:\/\///g' -e 's/\/.*//g'
        note "green" "..${page}"
    done
    note "white" "\n"
}
make_hosts()
{
    # usage: make_hosts outfile
    : "${1:?}"
    while read -r domain; do
        printf "0.0.0.0 %s\n" "$domain"
    done | tee "$1"
}
clean()
{
    # usage: clean outfile domain_list
    : "${1:?}"
    : "${2:?}"
note "white" "removing duplicates and known good domains\n"
curl https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts \
-o "${data_dir}/pornography-hosts"
while read -r domain; do
! grep -i "$domain" "${data_dir}/pornography-hosts" "${data_dir}/whitelist" > /dev/null &&
echo "$domain"
done < "$2" | make_hosts "$1"
}
main()
{
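    # parse options, pick a subreddit list, scrape every subreddit's outbound
    # links, then dedupe and filter the result into a hosts file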
out="hosts"
pages=10
list_fun="v2"
temp_fl="$(mktemp "${TMPDIR:-/tmp}/redditorsbtfo.XXX")"
dep_check curl jq grep sed
while getopts "ho:p:l:d:" f; do
case "$f" in
o)
out="$OPTARG"
;;
p)
pages="$OPTARG"
;;
l)
case "$OPTARG" in
"v1")
list_fun="v1"
;;
"v2")
list_fun="v2"
;;
"bak")
list_fun="bak"
;;
*)
usage
exit 1
;;
esac
;;
d)
data_dir="$OPTARG"
mkdir -p "$OPTARG" > /dev/null 2>&1 ||
exit 1
;;
h)
usage
exit
;;
?)
usage
exit 1
;;
esac
done
for sub in $(get_subs_${list_fun}); do
soutlinks "$sub" "$pages"
done | sort -u > "$temp_fl"
clean "$out" "$temp_fl"
rm "$temp_fl"
}
main "$@"