Add more crawler scripts
This commit is contained in:
parent
5dd3d58072
commit
b44c11dadf
37
count.rb
Executable file
37
count.rb
Executable file
@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env ruby
|
||||
|
||||
require 'json'
|
||||
|
||||
# Loads every crawler dump matching +pattern+ and merges the records by id.
#
# Each file is parsed as JSON and the array under its "data" key is read. Every
# element is indexed by its "id", so a record present in several files is kept
# once (the file read last wins).
#
# A file that cannot be read, parsed, or walked is skipped as a whole: none of
# its records are merged, its basename is recorded, and the reason is reported
# on stderr.
#
# @param pattern [String] glob of the dump files to read
# @return [Array(Hash, Integer, Array<String>)] records keyed by id, the number
#   of records read (duplicates included), and the basenames of skipped files
def load_videos(pattern)
  videos = {}
  total = 0
  failures = []

  Dir[pattern].each do |path|
    records = JSON.parse(File.read(path))["data"]
    # Index the whole file first so a malformed entry halfway through cannot
    # leave part of a failed file merged into the result.
    batch = records.each_with_object({}) { |video, by_id| by_id[video["id"]] = video }
    videos.merge!(batch)
    total += records.size
  rescue StandardError => e
    failures << File.basename(path)
    warn "error parsing #{path}: #{e.class}: #{e.message}"
  end

  [videos, total, failures]
end

videos, total, failures = load_videos("data/*.json")

# Unique videos first, then records read; the gap is the number of duplicates.
pp videos.size
pp total

# One "id,randname" row per unique video.
# NOTE(review): randname is written unescaped; a value containing a comma or a
# newline would corrupt the row -- confirm such names cannot occur.
csv_rows = videos.each_value.map { |video| "#{video["id"]},#{video["randname"]}\n" }
File.write("videos.csv", csv_rows.join)

# The full record of every unique video, one JSON document per line.
json_lines = videos.each_value.map { |video| JSON.dump(video) + "\n" }
File.write("videos.jsonl", json_lines.join)

# Only the first two dot-separated parts of each failed file name are kept.
# Presumably dumps are named "<a>.<b>.json" -- TODO confirm against the crawler.
failure_lines = failures.map do |name|
  first, second = name.split(".")
  "#{first} #{second}\n"
end
File.write("failures.txt", failure_lines.join)

# Lowest and highest id seen ([nil, nil] when nothing was loaded).
pp [videos.keys.min, videos.keys.max]
|
||||
|
||||
# Binary size helpers so byte counts can be written as `100.mib` or `1.tib`.
#
# These are defined on Integer rather than Fixnum: Fixnum was merged into
# Integer in Ruby 2.4 and the constant was removed in Ruby 3.2, where reopening
# `class Fixnum` would create an unrelated class and leave integers without
# these methods.
class Integer
  # @return [Integer] this many mebibytes, in bytes
  def mib
    self * 2**20
  end

  # @return [Integer] this many gibibytes, in bytes
  def gib
    self * 2**30
  end

  # @return [Integer] this many tebibytes, in bytes
  def tib
    self * 2**40
  end
end
|
||||
|
||||
# Rough storage estimate, taking 100 MiB as the size of every video.
puts "estimating 100mb per video:"

bytes_per_video = 100.mib

# Float division keeps totals below 1 TiB from being truncated to 0.
puts "so far: #{(videos.size * bytes_per_video).fdiv(1.tib).round(2)} TiB"

# The hash is keyed by video id, so the largest key is the highest id seen.
# Assumes ids are integers handed out sequentially, making the highest id an
# upper bound on the number of videos -- TODO confirm.
# Falls back to 0 when nothing was loaded instead of failing on nil.
highest_id = videos.keys.max || 0
puts "potential: #{(highest_id * bytes_per_video).fdiv(1.tib).round(2)} TiB"
|
Loading…
Reference in New Issue
Block a user