Add more crawler scripts

This commit is contained in:
Hugo Peixoto 2023-07-28 10:48:07 +01:00
parent 5dd3d58072
commit b44c11dadf
1 changed files with 37 additions and 0 deletions

37
count.rb Executable file
View File

@ -0,0 +1,37 @@
#!/usr/bin/env ruby
require 'json'
# Aggregates every crawled page matching data/*.json into one hash of videos
# keyed by id, prints summary counts, and writes three report files:
#   videos.csv   - one "id,randname" row per unique video
#   videos.jsonl - one JSON document per unique video
#   failures.txt - one line per page that could not be processed
videos = {}
total = 0
failures = []

Dir.glob("data/*.json").each do |path|
  begin
    entries = JSON.parse(File.read(path))["data"]
    # Entries sharing an id overwrite each other, so `videos` holds unique
    # ids while `total` counts every entry seen across all pages.
    entries.each do |entry|
      videos[entry["id"]] = entry
    end
    total += entries.size
  rescue
    # Any StandardError while reading or processing a page marks that page
    # as failed; the remaining pages are still processed.
    failures << File.basename(path)
    pp "error parsing #{path}"
  end
end

pp videos.size
pp total

csv_rows = videos.each_value.map { |video| "#{video["id"]},#{video["randname"]}\n" }
File.write("videos.csv", csv_rows.join)

json_lines = videos.each_value.map { |video| "#{JSON.dump(video)}\n" }
File.write("videos.jsonl", json_lines.join)

# Only the first two dot-separated parts of each failed file name are kept,
# written as "first second".
failure_lines = failures.map do |name|
  stem, extension = name.split(".")
  "#{stem} #{extension}\n"
end
File.write("failures.txt", failure_lines.join)

# Smallest and largest ids seen ([nil, nil] when nothing was loaded).
pp videos.keys.minmax
# Binary-size helpers on integers so byte counts read naturally,
# e.g. 100.mib is 100 * 2**20 bytes.
#
# These are defined on Integer rather than Fixnum: Fixnum was unified into
# Integer in Ruby 2.4 and the constant was removed in Ruby 3.2, where
# `class Fixnum` would define an unrelated new class and leave integer
# literals without these methods.
class Integer
  # @return [Integer] this many mebibytes, expressed in bytes
  def mib
    self * 2**20
  end

  # @return [Integer] this many gibibytes, expressed in bytes
  def gib
    self * 2**30
  end

  # @return [Integer] this many tebibytes, expressed in bytes
  def tib
    self * 2**40
  end
end
# Back-of-the-envelope storage estimate at 100 MiB per video.
# Both figures use integer division, so they are truncated to whole TiB.
puts "estimating 100mb per video:"
puts "so far: #{videos.size * 100.mib / 1.tib} TiB"
# Upper bound based on the highest id seen -- presumably ids are sequential
# integers, so the largest id approximates the total number of videos
# (TODO confirm). Falls back to 0 when no videos were loaded, because `max`
# on an empty list returns nil.
highest_id = videos.keys.max || 0
puts "potential: #{highest_id * 100.mib / 1.tib} TiB"