Add crawler scripts
This commit is contained in:
parent
b5516e7d82
commit
5dd3d58072
11
get-category.sh
Executable file
11
get-category.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash
#
# get-category.sh - download result pages 1..PAGES of one category and store
# each page as data/category-<term>.<page>.json.
#
# Usage: get-category.sh <term> [pages]
#   term   category identifier, handed to ./get.sh unchanged
#   pages  number of pages to fetch; defaults to 1
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in API token
#
# Run from the directory that contains get.sh; output goes to ./data.
# Exit status: 0 when every page was fetched, 1 when at least one fetch
# failed, 2 on bad usage.

# Lower-case names on purpose: assigning to TERM would overwrite the
# terminal-type variable that child processes inherit from the environment.
term="${1:-}"
pages="${2:-1}"
token="${SAPO_TOKEN:-581a26491893263804e888240fad1cf7}"

if [[ -z "$term" || ! "$pages" =~ ^[0-9]+$ ]]; then
  printf 'Usage: %s <term> [pages]\n' "${0##*/}" >&2
  exit 2
fi

# Force base 10 so a count such as "08" is not parsed as an octal literal.
pages=$((10#$pages))

mkdir -p data || exit 1

status=0
for ((page = 1; page <= pages; page++)); do
  printf 'category: %s.%s\n' "$term" "$page"
  out="data/category-${term}.${page}.json"
  if ! ./get.sh category "$term" "$page" "$token" > "$out"; then
    # Keep crawling the remaining pages, but make the failure visible.
    printf 'warning: failed to fetch category %s page %s\n' "$term" "$page" >&2
    status=1
  fi
done

exit "$status"
|
19
get-failures.sh
Executable file
19
get-failures.sh
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
#
# get-failures.sh - re-fetch search result pages listed in failures.txt.
#
# failures.txt holds one "<term> <page>" pair per line; each pair is fetched
# through ./get.sh (search mode) into data/<term>.<page>.json.
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in API token
#
# NOTE: the script is intentionally disabled. It prints "not working" and
# exits with status 1 before reading failures.txt, so the loop below never
# runs. The reason is not recorded here; remove the guard only after
# confirming the retry logic behaves as intended.

token="${SAPO_TOKEN:-581a26491893263804e888240fad1cf7}"

echo "not working" >&2
exit 1

# Unreachable while the guard above is in place.
# Variables are lower-case so the terminal-type variable TERM is not
# overwritten; -r keeps backslashes in the input literal.
while read -r term page; do
  # Skip blank or incomplete lines.
  [[ -n "$term" && -n "$page" ]] || continue
  printf '%s.%s\n' "$term" "$page"
  # Same request the search branch of get.sh issues, so reuse it instead of
  # carrying a second copy of the curl command line.
  ./get.sh search "$term" "$page" "$token" > "data/${term}.${page}.json"
done < failures.txt
|
11
get-search.sh
Executable file
11
get-search.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash
#
# get-search.sh - download result pages 1..PAGES of a search query and store
# each page as data/<term>.<page>.json.
#
# Usage: get-search.sh <term> [pages]
#   term   search query, handed to ./get.sh unchanged
#   pages  number of pages to fetch; defaults to 1
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in API token
#
# Run from the directory that contains get.sh; output goes to ./data.
# Exit status: 0 when every page was fetched, 1 when at least one fetch
# failed, 2 on bad usage.

# Lower-case names on purpose: assigning to TERM would overwrite the
# terminal-type variable that child processes inherit from the environment.
term="${1:-}"
pages="${2:-1}"
token="${SAPO_TOKEN:-581a26491893263804e888240fad1cf7}"

if [[ -z "$term" || ! "$pages" =~ ^[0-9]+$ ]]; then
  printf 'Usage: %s <term> [pages]\n' "${0##*/}" >&2
  exit 2
fi

# Force base 10 so a count such as "08" is not parsed as an octal literal.
pages=$((10#$pages))

mkdir -p data || exit 1

status=0
for ((page = 1; page <= pages; page++)); do
  printf 'search: %s.%s\n' "$term" "$page"
  out="data/${term}.${page}.json"
  if ! ./get.sh search "$term" "$page" "$token" > "$out"; then
    # Keep crawling the remaining pages, but make the failure visible.
    printf 'warning: failed to fetch search %s page %s\n' "$term" "$page" >&2
    status=1
  fi
done

exit "$status"
|
15
get-tag.sh
Executable file
15
get-tag.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
#
# get-tag.sh - download every result page for one tag.
#
# Page 1 is requested first to read "total_pages" from the JSON response;
# pages 1..total_pages are then stored as data/tag-<hex term>.<page>.json,
# where <hex term> is the tag hex-encoded for use in a file name.
#
# Usage: get-tag.sh <term>
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in API token
#
# Requires: jq, xxd. Run from the directory that contains get.sh.
# Exit status: 0 when every page was fetched, 1 when the page count could
# not be determined or a fetch failed, 2 on bad usage.

# Lower-case names on purpose: assigning to TERM would overwrite the
# terminal-type variable that child processes inherit from the environment.
term="${1:-}"
token="${SAPO_TOKEN:-581a26491893263804e888240fad1cf7}"

if [[ -z "$term" ]]; then
  printf 'Usage: %s <term>\n' "${0##*/}" >&2
  exit 2
fi

# xxd -p breaks its output into lines of 30 input bytes; strip the newlines
# so a long term still yields a single-line file name component.
# printf instead of echo -n so a term that looks like an echo option is
# still encoded literally.
enc_term="$(printf '%s' "$term" | xxd -p | tr -d '\n')"

pages="$(./get.sh tag "$term" 1 "$token" | jq -r '.total_pages')"
if [[ ! "$pages" =~ ^[0-9]+$ ]]; then
  # A failed request or unexpected payload yields an empty value or "null".
  printf 'error: no usable total_pages for tag %s (got: %s)\n' \
    "$term" "${pages:-<empty>}" >&2
  exit 1
fi
# Force base 10 so the value is never parsed as an octal literal.
pages=$((10#$pages))

printf '%s %s\n' "$enc_term" "$pages" >&2

mkdir -p data || exit 1

status=0
for ((page = 1; page <= pages; page++)); do
  printf 'tag: %s.%s\n' "$term" "$page"
  out="data/tag-${enc_term}.${page}.json"
  if ! ./get.sh tag "$term" "$page" "$token" > "$out"; then
    # Keep crawling the remaining pages, but make the failure visible.
    printf 'warning: failed to fetch tag %s page %s\n' "$term" "$page" >&2
    status=1
  fi
done

exit "$status"
|
15
get-username.sh
Executable file
15
get-username.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
#
# get-username.sh - download every video-list page for one username.
#
# Page 1 is requested first to read "total_pages" from the JSON response;
# pages 1..total_pages are then stored as
# data/username.<hex term>.<page>.json, where <hex term> is the username
# hex-encoded for use in a file name.
#
# Usage: get-username.sh <term>
#
# Environment:
#   SAPO_TOKEN  optional; replaces the built-in API token
#
# Requires: jq, xxd. Run from the directory that contains get.sh.
# All progress output goes to stderr.
# Exit status: 0 when every page was fetched, 1 when the page count could
# not be determined or a fetch failed, 2 on bad usage.

# Lower-case names on purpose: assigning to TERM would overwrite the
# terminal-type variable that child processes inherit from the environment.
term="${1:-}"
token="${SAPO_TOKEN:-581a26491893263804e888240fad1cf7}"

if [[ -z "$term" ]]; then
  printf 'Usage: %s <term>\n' "${0##*/}" >&2
  exit 2
fi

# xxd -p breaks its output into lines of 30 input bytes; strip the newlines
# so a long term still yields a single-line file name component.
# printf instead of echo -n so a term that looks like an echo option is
# still encoded literally.
enc_term="$(printf '%s' "$term" | xxd -p | tr -d '\n')"

pages="$(./get.sh username "$term" 1 "$token" | jq -r '.total_pages')"
if [[ ! "$pages" =~ ^[0-9]+$ ]]; then
  # A failed request or unexpected payload yields an empty value or "null".
  printf 'error: no usable total_pages for username %s (got: %s)\n' \
    "$term" "${pages:-<empty>}" >&2
  exit 1
fi
# Force base 10 so the value is never parsed as an octal literal.
pages=$((10#$pages))

printf '%s %s\n' "$enc_term" "$pages" >&2

mkdir -p data || exit 1

status=0
for ((page = 1; page <= pages; page++)); do
  printf 'username: %s.%s\n' "$term" "$page" >&2
  out="data/username.${enc_term}.${page}.json"
  if ! ./get.sh username "$term" "$page" "$token" > "$out"; then
    # Keep crawling the remaining pages, but make the failure visible.
    printf 'warning: failed to fetch username %s page %s\n' "$term" "$page" >&2
    status=1
  fi
done

exit "$status"
|
53
get.sh
Executable file
53
get.sh
Executable file
@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
#
# get.sh - request one page from the videos.sapo.pt AJAX endpoints and write
# the response body to stdout.
#
# Usage: get.sh <type> <term> <page> <token>
#   type   one of: search, category, username, tag
#   term   query / category / username / tag, inserted into the URL as-is
#   page   page number to request
#   token  value sent both as the "token" URL parameter and as the
#          sv_token cookie
#
# Exit status: 1 on bad usage or unknown type; otherwise curl's exit status
# (non-zero for transport errors and, because of --fail, for HTTP errors).

usage() {
  printf 'Usage: %s <search|category|username|tag> <term> <page> <token>\n' \
    "${0##*/}" >&2
  exit 1
}

(($# >= 4)) || usage

# Lower-case names on purpose: assigning to TERM would overwrite the
# terminal-type variable that curl inherits from the environment.
kind="$1"
term="$2"
page="$3"
token="$4"

base='https://videos.sapo.pt/ajax'

# NOTE(review): term is placed in the URL without percent-encoding, exactly
# as before. Confirm whether callers pass pre-encoded values before changing
# this to --data-urlencode.
case "$kind" in
  search)
    url="${base}/search?q=${term}&type=videos&token=${token}&nocache=9638&page=${page}&order=rel"
    ;;
  category)
    url="${base}/category/${term}?token=${token}&nocache=9544&page=${page}&order=releve"
    ;;
  username)
    url="${base}/video?username=${term}&token=${token}&page=${page}"
    ;;
  tag)
    url="${base}/searchbytag?q=${term}&type=videos&token=${token}&page=${page}&order=releve&limit=20"
    ;;
  *)
    usage
    ;;
esac

# Request headers shared by every endpoint. Accept-Encoding is not listed:
# --compressed below makes curl advertise the encodings it can decode and
# decompress the response, whereas a hand-written Accept-Encoding header
# without --compressed leaves a compressed body undecoded on stdout.
headers=(
  -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
  -H 'Accept: text/javascript,text/xml,application/xml,application/xhtml+xml,text/html,application/json;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1'
  -H 'Accept-Language: en-US'
  -H 'X-Requested-With: XMLHttpRequest'
  -H "Cookie: language=pt; sso_tld=POR; bsu-v3-api=1689792348549; sv_token=${token}"
)

# --silent keeps the progress meter off; --show-error still reports
# failures on stderr; --fail turns HTTP error responses into a non-zero
# exit instead of printing the error page as if it were data.
curl --silent --show-error --fail --compressed "${headers[@]}" "$url"
|
Loading…
Reference in New Issue
Block a user