38 lines
893 B
Ruby
38 lines
893 B
Ruby
|
#!/usr/bin/env ruby
|
||
|
|
||
|
require 'json'
|
||
|
|
||
|
# Merge every JSON dump under data/ into a single table keyed by video id,
# then export the merged table and the list of files that could not be read.
videos = {}    # id => record; a later file overwrites an earlier one with the same id
total = 0      # records seen across all readable files, duplicates included
failures = []  # basenames of files that raised while being read or merged

Dir.glob("data/*.json").each do |path|
  begin
    entries = JSON.parse(File.read(path))["data"]

    entries.each do |entry|
      videos[entry["id"]] = entry
    end
    total += entries.size
  rescue
    # Any StandardError (unreadable file, invalid JSON, missing "data" key,
    # malformed entry) marks the whole file as failed; loading carries on.
    failures.push(File.basename(path))
    pp "error parsing #{path}"
  end
end

# Unique ids vs. raw record count.
pp videos.size
pp total

records = videos.values

# One "id,randname" line per unique video (fields are written unescaped).
csv_lines = records.map { |record| "#{record["id"]},#{record["randname"]}\n" }
File.write("videos.csv", csv_lines.join)

# One JSON document per line.
jsonl_lines = records.map { |record| JSON.dump(record) + "\n" }
File.write("videos.jsonl", jsonl_lines.join)

# First two dot-separated parts of each failed file name, space separated.
failure_lines = failures.map do |name|
  first, second = name.split(".")
  "#{first} #{second}\n"
end
File.write("failures.txt", failure_lines.join)

# Smallest and largest id seen ([nil, nil] when nothing was loaded).
pp videos.keys.minmax
|
||
|
|
||
|
# Byte-size helpers on integers, e.g. 100.mib or 1.tib.
#
# Integer is reopened instead of Fixnum: Fixnum was deprecated in Ruby 2.4
# (where it became an alias of Integer) and removed in Ruby 3.2, where
# `class Fixnum` would define a new, unrelated class and leave `100.mib`
# raising NoMethodError. On rubies older than 2.4 Fixnum inherited from
# Integer, so existing call sites keep working everywhere.
class Integer
  # @return [Integer] self mebibytes expressed in bytes (self * 2**20)
  def mib
    self * 2**20
  end

  # @return [Integer] self gibibytes expressed in bytes (self * 2**30)
  def gib
    self * 2**30
  end

  # @return [Integer] self tebibytes expressed in bytes (self * 2**40)
  def tib
    self * 2**40
  end
end
|
||
|
|
||
|
# Rough storage estimate at a flat 100 MiB per video. Both figures are whole
# TiB: the integer division truncates, so anything under 1 TiB prints as 0.
per_video = 100.mib

# Largest id seen (the keys of `videos` are the record ids). Falls back to 0
# when nothing was loaded, so the estimate still prints instead of raising
# NoMethodError on nil.
highest_id = videos.keys.max || 0

puts "estimating 100mb per video:"
puts "so far: #{videos.size * per_video / 1.tib} TiB"
# Uses the highest id as the video count — presumably ids are assigned
# sequentially; confirm against the data source. Labelled TiB because the
# value is divided by 1.tib, matching the line above.
puts "potential: #{highest_id * per_video / 1.tib} TiB"
|