#!/usr/bin/env ruby

# Summarises the JSON listings found in data/*.json.
#
# Every file is expected to hold an object whose "data" key is an array of
# video records. Records are merged by their "id" (a later record replaces an
# earlier one with the same id). The script then:
#   * prints the number of distinct videos and the number of records read,
#   * writes videos.csv (id,randname), videos.jsonl (one record per line) and
#     failures.txt (names of the files that could not be processed),
#   * prints the lowest/highest id and a rough storage estimate.

require 'json'

# Binary size helpers (100.mib, 1.gib, 1.tib).
#
# Implemented as a refinement of Integer: Fixnum no longer exists on Ruby 3.2+,
# so reopening it would define an unrelated class and leave 100.mib undefined.
# The refinement also keeps these helpers local to this file.
module ByteUnits
  refine Integer do
    def mib; self * 2**20; end
    def gib; self * 2**30; end
    def tib; self * 2**40; end
  end
end

using ByteUnits

# Reads each listing file and merges the records by id.
#
# @param paths [Array<String>] paths of the JSON files to read
# @return [Array(Hash, Integer, Array<String>)] the records keyed by id, the
#   number of records counted (duplicates included), and the basenames of the
#   files that raised while being read, parsed or merged
def load_videos(paths)
  videos = {}
  total = 0
  failures = []

  paths.each do |path|
    begin
      entries = JSON.parse(File.read(path))["data"]

      entries.each { |entry| videos[entry["id"]] = entry }
      total += entries.size
    rescue StandardError
      # Best effort: remember the file and carry on with the next one.
      failures << File.basename(path)
      pp "error parsing #{path}"
    end
  end

  [videos, total, failures]
end

# Storage needed for +video_count+ videos at the assumed 100 MiB each.
#
# Float division is used on purpose: integer division would report 0 TiB for
# anything below roughly 10,486 videos.
#
# @param video_count [Integer] number of videos
# @return [Float] size in TiB
def estimated_tib(video_count)
  (video_count * 100.mib).fdiv(1.tib)
end

videos, total, failures = load_videos(Dir["data/*.json"])

pp videos.size
pp total

records = videos.values

File.write("videos.csv", records.map { |video| "#{video["id"]},#{video["randname"]}\n" }.join)
File.write("videos.jsonl", records.map { |video| JSON.dump(video) + "\n" }.join)
File.write("failures.txt", failures.map { |name|
  # Only the first two dot-separated parts of the file name are kept.
  first, second = name.split(".")
  "#{first} #{second}\n"
}.join)

lowest_id, highest_id = videos.keys.minmax
pp [lowest_id, highest_id]

puts "estimating 100mb per video:"
puts "so far: #{format('%.2f', estimated_tib(videos.size))} TiB"
# The highest id is used as an upper bound on the number of videos
# (presumably ids are sequential integers - confirm against the data source).
# Falls back to 0 when nothing was parsed, so an empty run does not crash.
puts "potential: #{format('%.2f', estimated_tib(highest_id || 0))} TiB"