#!/bin/sh
data_dir="data"
usage()
{
    cat <<- EOF
Usage: ${0##*/} [options]
Scrape reddit to block new nsfw domains
Options:
-o outfile, (default hosts)
-p number of pages to scrape, (default 10)
-d data directory, (default ./data)
-l subreddit list to scrape, (default v2)
v1 (1k subreddits)
v2 (8k subreddits)
bak (use the list in ${data_dir}/subs.bak.gz)
EOF
}
note()
{
    # usage: note color string1 [string2] ...
    : "${2:?}"
    color="${1:?}"
    shift
    case "$color" in
        "green")
            printf "\033[32;1m%b\033[0m" "$*" 1>&2
            ;;
        "yellow")
            printf "\033[33;1m%b\033[0m" "$*" 1>&2
            ;;
        "red")
            printf "\033[31;1m%b\033[0m" "$*" 1>&2
            ;;
        *)
            printf "%b" "$*" 1>&2
            ;;
    esac
}
dep_check()
{
    # usage: dep_check cmd_1 [cmd_2] ...
    : "${1:?}"
    for dep; do
        if ! command -v "$dep" 1>/dev/null; then
            note "red" "err: $dep not found, please install it\n"
            exit 127
        fi
    done
    unset dep
}
get_curl()
{
curl "$@" \
--silent \
--compressed \
--header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0'
}
get_reddit()
{
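    # the cookie opts in to gated/quarantined subreddits and the over-18 wall;
    # --write-out appends the http status code to the body so callers can branch on it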
    # 'Cookie:_options={"pref_gated_sr_optin": true, "pref_quarantine_optin": true}; over18=1'
    get_curl "$1" \
        --header 'Cookie: _options=%7B%22pref_gated_sr_optin%22%3A%20true%2C%20%22pref_quarantine_optin%22%3A%20true%7D; over18=1' \
        --write-out "%{http_code}"
}
get_subs_v1()
{
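    # the escaped '#' makes curl treat '#.html' as a url fragment, so the first
    # request fetches /nsfw while pages 2-5 fetch /nsfw2.html .. /nsfw5.html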
note "white" "scrapping nsfw subreddit list\n"
for page in \# $(seq 2 5); do
get_reddit "https://redditlist.com/nsfw${page}.html" |
grep -o 'reddit.com/r/[^"]*' |
grep -v '^reddit.com/r/Redditlist/$'
done | sort -u
}
get_subs_v2()
{
    data=
    page=1
    note "white" "scraping nsfw subreddit list (v2)\n"
    while true; do
        data="$(get_curl "https://be.mysubs.cc/api/subs/?page=${page}")"
        # stop on an empty response instead of retrying the same page forever
        [ -n "$data" ] || break
        for sub in $(echo "$data" | jq -r '.results[].name'); do
            printf "reddit.com/r/%s\n" "$sub"
        done
        # the api reports .next as null on the last page
        [ "$(echo "$data" | jq -r '.next')" = "null" ] && break
        page=$((page + 1))
    done
}
get_subs_bak()
{
    zcat "${data_dir}/subs.bak.gz"
}
soutlinks()
{
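    # old.reddit paginates listings with ?count=N (25 posts per page); each page is
    # fetched, its trailing http status code is checked, and the hosts of outbound
    # links are printed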
    # usage: soutlinks reddit.com/r/subreddit pages
    : "${1:?}"
    pages="${2:?}"
    page=0
    note "white" "${1#*/} "
    while [ "$page" -lt "$pages" ]; do
        data="$(get_reddit "https://old.${1}/?count=$((page * 25))")" &&
            page=$((page + 1))
        if [ -n "$data" ]; then
            # get_reddit appends the http status code, so branch on the trailing digits
            case "$data" in
                *"404")
                    note "yellow" " banned subreddit"
                    break
                    ;;
                *"451")
                    note "yellow" " banned in your country"
                    break
                    ;;
                *"403")
                    note "red" " private subreddit"
                    break
                    ;;
                *"429")
                    note "red" "..too many requests"
                    page=$((page - 1))
                    continue
                    ;;
                *"200")
                    ;;
                *)
                    note "red" "..something went wrong"
                    page=$((page - 1))
                    continue
                    ;;
            esac
        else
            note "red" "..empty"
            continue
        fi
        echo "$data" |
            grep -Eo 'class="title may-blank outbound" [^ ]* href="https?://[^"]*' |
            sed -E -e 's/.*href="https?:\/\///g' -e 's/\/.*//g'
        note "green" "..${page}"
    done
    note "white" "\n"
}
make_hosts()
{
    # usage: make_hosts outfile
    : "${1:?}"
    while read -r domain; do
        printf "0.0.0.0 %s\n" "$domain"
    done | tee "$1"
}
clean()
{
    # usage: clean outfile domain_list
    : "${1:?}"
    : "${2:?}"
note "white" "removing duplicates and known good domains\n"
curl https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts \
-o "${data_dir}/pornography-hosts"
while read -r domain; do
! grep -i "$domain" "${data_dir}/pornography-hosts" "${data_dir}/whitelist" > /dev/null &&
echo "$domain"
done < "$2" | make_hosts "$1"
}
main()
{
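    # parse options, pick a subreddit list, scrape every subreddit's outbound
    # links, then dedupe and filter the result into a hosts file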
out="hosts"
pages=10
list_fun="v2"
temp_fl="$(mktemp "${TMPDIR:-/tmp}/redditorsbtfo.XXX")"
dep_check curl jq grep sed
while getopts "ho:p:l:d:" f; do
case "$f" in
o)
out="$OPTARG"
;;
p)
pages="$OPTARG"
;;
l)
case "$OPTARG" in
"v1")
list_fun="v1"
;;
"v2")
list_fun="v2"
;;
"bak")
list_fun="bak"
;;
*)
usage
exit 1
;;
esac
;;
d)
data_dir="$OPTARG"
mkdir -p "$OPTARG" > /dev/null 2>&1 ||
exit 1
;;
h)
usage
exit
;;
?)
usage
exit 1
;;
esac
done
for sub in $(get_subs_${list_fun}); do
soutlinks "$sub" "$pages"
done | sort -u > "$temp_fl"
clean "$out" "$temp_fl"
rm "$temp_fl"
}
main "$@"