#!/usr/bin/env ruby
# Merges every data/*.json dump into a single video index keyed by id, writes
# that index out as CSV and JSON Lines, records which dumps could not be
# loaded, and prints a rough storage estimate.

require 'json'

# Binary-size helpers on Integer, e.g. 100.mib or 1.tib.
#
# These are defined on Integer rather than Fixnum: Fixnum was merged into
# Integer in Ruby 2.4 and the constant was removed in Ruby 3.2, where
# `class Fixnum` would silently create a new, unrelated class.
class Integer
  # @return [Integer] self mebibytes expressed in bytes
  def mib
    self * 2**20
  end

  # @return [Integer] self gibibytes expressed in bytes
  def gib
    self * 2**30
  end

  # @return [Integer] self tebibytes expressed in bytes
  def tib
    self * 2**40
  end
end

# Flat per-video size assumed by the storage estimate.
BYTES_PER_VIDEO = 100.mib

# Estimated storage for a number of videos.
#
# Uses float division so that small collections do not truncate to 0.
#
# @param video_count [Numeric] how many videos to account for
# @return [Float] estimated size in TiB, rounded to two decimals
def estimated_tib(video_count)
  (video_count * BYTES_PER_VIDEO).fdiv(1.tib).round(2)
end

videos = {}   # id => video record; later dumps overwrite earlier ones
total = 0     # records seen across all dumps, duplicates included
failures = [] # basenames of dumps that could not be read or parsed

Dir["data/*.json"].each do |path|
  data = JSON.parse(File.read(path))["data"]
  data.each { |x| videos[x["id"]] = x }
  total += data.size
rescue StandardError
  # Best effort: note the bad dump and carry on with the remaining files.
  failures << File.basename(path)
  pp "error parsing #{path}"
end

pp videos.size
pp total

File.write("videos.csv", videos.values.map { |video| "#{video["id"]},#{video["randname"]}\n" }.join)
File.write("videos.jsonl", videos.values.map { |video| JSON.dump(video) + "\n" }.join)
# Each failure line is the basename with its first "." replaced by a space
# (e.g. "123.json" becomes "123 json").
File.write("failures.txt", failures.map { |name| a, b = name.split("."); "#{a} #{b}\n" }.join)

ids = videos.keys
pp [ids.min, ids.max]

puts "estimating 100mb per video:"
puts "so far: #{estimated_tib(videos.size)} TiB"
# Treats the highest id seen as an upper bound on the number of videos;
# falls back to 0 when no video could be loaded at all.
puts "potential: #{estimated_tib(ids.max || 0)} TiB"