Add crawler scripts

This commit is contained in:
Hugo Peixoto 2023-07-28 10:47:33 +01:00
parent b5516e7d82
commit 5dd3d58072
6 changed files with 124 additions and 0 deletions

11
get-category.sh Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# get-category.sh — download result pages 1..PAGES of a videos.sapo.pt
# category, one file per page: data/category-<CATEGORY>.<page>.json
#
# Usage: ./get-category.sh CATEGORY [PAGES]
#   CATEGORY  category identifier, handed to ./get.sh unchanged
#   PAGES     number of pages to fetch; defaults to 1
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in token handed to ./get.sh
#
# Run it from the directory that contains get.sh: both ./get.sh and the
# data/ output directory are resolved relative to the current directory.
#
# Exit status: 0 if every page was fetched, 1 if any page failed,
# 2 on invalid arguments.

readonly DEFAULT_TOKEN="581a26491893263804e888240fad1cf7"

main() {
  # Lower-case locals on purpose: assigning to TERM would overwrite the
  # terminal-type variable of the same name.
  local term="${1-}"
  local pages="${2:-1}"
  local token="${SAPO_TOKEN:-$DEFAULT_TOKEN}"
  local page
  local status=0

  if [[ -z "$term" || ! "$pages" =~ ^[0-9]+$ ]]; then
    echo "usage: ${0##*/} CATEGORY [PAGES]" >&2
    return 2
  fi

  # The redirection below fails for every page if data/ does not exist.
  mkdir -p data || return 1

  # seq (rather than shell arithmetic) so values such as "08" are accepted.
  for page in $(seq 1 "$pages"); do
    echo "category: $term.$page"
    if ! ./get.sh category "$term" "$page" "$token" \
      > "data/category-$term.$page.json"; then
      # Keep going: one bad page should not stop the rest of the crawl.
      echo "warning: category $term page $page failed" >&2
      status=1
    fi
  done

  return "$status"
}

main "$@"

19
get-failures.sh Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# get-failures.sh — currently disabled.
#
# The script prints "not working" on stderr and exits with status 1 before
# doing any work. Everything after the guard is unreachable; it is kept so
# the retry logic is not lost. If it ran, it would read "TERM PAGE" pairs
# from failures.txt and fetch each search page into data/TERM.PAGE.json.

token="581a26491893263804e888240fad1cf7"

# Guard: refuse to run.
echo "not working" >&2
exit 1

# ---- unreachable from here on ----

# Request headers shared by every retry.
request_headers=(
  -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
  -H 'Accept: text/javascript,text/xml,application/xml,application/xhtml+xml,text/html,application/json;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1'
  -H 'Accept-Language: en-US'
  -H 'Accept-Encoding: gzip, deflate, br'
  -H 'X-Requested-With: XMLHttpRequest'
  -H "Cookie: language=pt; sso_tld=POR; bsu-v3-api=1689792348549; sv_token=$token"
)

# One "term page" pair per input line.
while read term page; do
  echo "$term.$page"
  curl -s \
    "https://videos.sapo.pt/ajax/search?q=$term&type=videos&token=$token&nocache=9638&page=$page&order=rel" \
    "${request_headers[@]}" \
    -o "data/$term.$page.json"
done < failures.txt

11
get-search.sh Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# get-search.sh — download result pages 1..PAGES of a videos.sapo.pt search,
# one file per page: data/<TERM>.<page>.json
#
# Usage: ./get-search.sh TERM [PAGES]
#   TERM   search term, handed to ./get.sh unchanged
#   PAGES  number of pages to fetch; defaults to 1
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in token handed to ./get.sh
#
# Run it from the directory that contains get.sh: both ./get.sh and the
# data/ output directory are resolved relative to the current directory.
#
# Exit status: 0 if every page was fetched, 1 if any page failed,
# 2 on invalid arguments.

readonly DEFAULT_TOKEN="581a26491893263804e888240fad1cf7"

main() {
  # Lower-case locals on purpose: assigning to TERM would overwrite the
  # terminal-type variable of the same name.
  local term="${1-}"
  local pages="${2:-1}"
  local token="${SAPO_TOKEN:-$DEFAULT_TOKEN}"
  local page
  local status=0

  if [[ -z "$term" || ! "$pages" =~ ^[0-9]+$ ]]; then
    echo "usage: ${0##*/} TERM [PAGES]" >&2
    return 2
  fi

  # The redirection below fails for every page if data/ does not exist.
  mkdir -p data || return 1

  # seq (rather than shell arithmetic) so values such as "08" are accepted.
  for page in $(seq 1 "$pages"); do
    echo "search: $term.$page"
    if ! ./get.sh search "$term" "$page" "$token" \
      > "data/$term.$page.json"; then
      # Keep going: one bad page should not stop the rest of the crawl.
      echo "warning: search $term page $page failed" >&2
      status=1
    fi
  done

  return "$status"
}

main "$@"

15
get-tag.sh Executable file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# get-tag.sh — download every result page for a videos.sapo.pt tag, one file
# per page: data/tag-<hex of TAG>.<page>.json
#
# Usage: ./get-tag.sh TAG
#   TAG  tag to look up, handed to ./get.sh unchanged
#
# The page count is the total_pages field of the page-1 response, read with
# jq. The tag is hex-encoded in the file name so that any tag text yields a
# usable file name.
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in token handed to ./get.sh
#
# Run it from the directory that contains get.sh: both ./get.sh and the
# data/ output directory are resolved relative to the current directory.
#
# Exit status: 0 if every page was fetched, 1 if the page count could not be
# determined or any page failed, 2 on invalid arguments.

readonly DEFAULT_TOKEN="581a26491893263804e888240fad1cf7"

# Prints the bytes of $1 as one unbroken lowercase hex string.
# xxd -p starts a new output line every 30 input bytes; the line breaks are
# stripped so long values do not put newlines into file names.
hex_encode() {
  printf '%s' "$1" | xxd -p | tr -d '\n'
}

main() {
  # Lower-case locals on purpose: assigning to TERM would overwrite the
  # terminal-type variable of the same name.
  local term="${1-}"
  local token="${SAPO_TOKEN:-$DEFAULT_TOKEN}"
  local encterm pages page
  local status=0

  if [[ -z "$term" ]]; then
    echo "usage: ${0##*/} TAG" >&2
    return 2
  fi

  encterm="$(hex_encode "$term")"
  pages="$(./get.sh tag "$term" "1" "$token" | jq .total_pages)"
  echo "$encterm $pages" >&2

  # jq prints "null" when the field is absent and nothing on unparsable input.
  if [[ ! "$pages" =~ ^[0-9]+$ ]]; then
    echo "error: no usable total_pages for tag '$term' (got '$pages')" >&2
    return 1
  fi

  # The redirection below fails for every page if data/ does not exist.
  mkdir -p data || return 1

  for page in $(seq 1 "$pages"); do
    echo "tag: $term.$page"
    if ! ./get.sh tag "$term" "$page" "$token" \
      > "data/tag-$encterm.$page.json"; then
      # Keep going: one bad page should not stop the rest of the crawl.
      echo "warning: tag $term page $page failed" >&2
      status=1
    fi
  done

  return "$status"
}

main "$@"

15
get-username.sh Executable file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# get-username.sh — download every video-listing page for a videos.sapo.pt
# user, one file per page: data/username.<hex of USERNAME>.<page>.json
#
# Usage: ./get-username.sh USERNAME
#   USERNAME  user to look up, handed to ./get.sh unchanged
#
# The page count is the total_pages field of the page-1 response, read with
# jq. The user name is hex-encoded in the file name so that any text yields a
# usable file name. All progress output goes to stderr.
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in token handed to ./get.sh
#
# Run it from the directory that contains get.sh: both ./get.sh and the
# data/ output directory are resolved relative to the current directory.
#
# Exit status: 0 if every page was fetched, 1 if the page count could not be
# determined or any page failed, 2 on invalid arguments.

readonly DEFAULT_TOKEN="581a26491893263804e888240fad1cf7"

# Prints the bytes of $1 as one unbroken lowercase hex string.
# xxd -p starts a new output line every 30 input bytes; the line breaks are
# stripped so long values do not put newlines into file names.
hex_encode() {
  printf '%s' "$1" | xxd -p | tr -d '\n'
}

main() {
  # Lower-case locals on purpose: assigning to TERM would overwrite the
  # terminal-type variable of the same name.
  local term="${1-}"
  local token="${SAPO_TOKEN:-$DEFAULT_TOKEN}"
  local encterm pages page
  local status=0

  if [[ -z "$term" ]]; then
    echo "usage: ${0##*/} USERNAME" >&2
    return 2
  fi

  encterm="$(hex_encode "$term")"
  pages="$(./get.sh username "$term" "1" "$token" | jq .total_pages)"
  echo "$encterm $pages" >&2

  # jq prints "null" when the field is absent and nothing on unparsable input.
  if [[ ! "$pages" =~ ^[0-9]+$ ]]; then
    echo "error: no usable total_pages for user '$term' (got '$pages')" >&2
    return 1
  fi

  # The redirection below fails for every page if data/ does not exist.
  mkdir -p data || return 1

  for page in $(seq 1 "$pages"); do
    echo "username: $term.$page" >&2
    if ! ./get.sh username "$term" "$page" "$token" \
      > "data/username.$encterm.$page.json"; then
      # Keep going: one bad page should not stop the rest of the crawl.
      echo "warning: username $term page $page failed" >&2
      status=1
    fi
  done

  return "$status"
}

main "$@"

53
get.sh Executable file
View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
#
# get.sh — fetch one page of a videos.sapo.pt AJAX listing and write the
# response body to stdout.
#
# Usage: ./get.sh TYPE TERM PAGE TOKEN
#   TYPE   listing to query: search | category | username | tag
#   TERM   search text, category, user name or tag
#   PAGE   page number to request
#   TOKEN  sent both as the token= query parameter and as the sv_token cookie
#
# NOTE(review): TERM is placed into the URL exactly as given, with no
# URL-encoding. Confirm whether callers pre-encode before adding encoding
# here, otherwise values would be encoded twice.
#
# Exit status: 2 when fewer than four arguments are given, 1 for an unknown
# TYPE, otherwise the exit status of curl.

readonly BASE_URL="https://videos.sapo.pt/ajax"

# Prints the request URL for listing type $1, term $2, page $3, token $4.
# Returns 1 and prints nothing when the type is not recognised.
build_url() {
  local type="$1" term="$2" page="$3" token="$4"

  case "$type" in
    search)
      printf '%s\n' "$BASE_URL/search?q=$term&type=videos&token=$token&nocache=9638&page=$page&order=rel"
      ;;
    category)
      printf '%s\n' "$BASE_URL/category/$term?token=$token&nocache=9544&page=$page&order=releve"
      ;;
    username)
      printf '%s\n' "$BASE_URL/video?username=$term&token=$token&page=$page"
      ;;
    tag)
      printf '%s\n' "$BASE_URL/searchbytag?q=$term&type=videos&token=$token&page=$page&order=releve&limit=20"
      ;;
    *)
      return 1
      ;;
  esac
}

main() {
  if (( $# < 4 )); then
    echo "usage: ${0##*/} TYPE TERM PAGE TOKEN" >&2
    return 2
  fi

  # Lower-case locals on purpose: assigning to TERM would overwrite the
  # terminal-type variable of the same name.
  local type="$1" term="$2" page="$3" token="$4"
  local url

  if ! url="$(build_url "$type" "$term" "$page" "$token")"; then
    echo "${0##*/}: unknown type '$type' (expected search, category, username or tag)" >&2
    return 1
  fi

  # The same browser-like headers are sent for every listing type.
  local -a headers=(
    -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    -H 'Accept: text/javascript,text/xml,application/xml,application/xhtml+xml,text/html,application/json;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1'
    -H 'Accept-Language: en-US'
    -H 'Accept-Encoding: gzip, deflate, br'
    -H 'X-Requested-With: XMLHttpRequest'
    -H "Cookie: language=pt; sso_tld=POR; bsu-v3-api=1689792348549; sv_token=$token"
  )

  # --compressed: the Accept-Encoding header above invites a compressed
  # reply; without this flag curl would write the compressed bytes as-is.
  # -S: keep the progress meter off (-s) but still report errors on stderr.
  curl -sS --compressed "$url" "${headers[@]}"
}

main "$@"