# Batch processing multiple archives

## Overview
Process large numbers of CAB archives efficiently. The examples below build up from a simple sequential extractor to parallel processing, progress reporting, and error recovery with logging.
## Simple Batch Extractor

```ruby
#!/usr/bin/env ruby
require 'cabriolet'
require 'fileutils'
require 'find'

class BatchProcessor
  def initialize(input_dir, output_dir)
    @input_dir = input_dir
    @output_dir = output_dir
    @results = { success: [], failed: [], total: 0 }
  end

  def process_all
    cab_files = find_cab_files
    puts "Found #{cab_files.count} CAB files to process\n"

    cab_files.each_with_index do |cab_file, index|
      process_cabinet(cab_file, index + 1, cab_files.count)
    end

    print_summary
  end

  private

  # Recursively collect .cab files (case-insensitive) under the input directory.
  def find_cab_files
    files = []
    Find.find(@input_dir) do |path|
      files << path if File.file?(path) && path.downcase.end_with?('.cab')
    end
    files.sort
  end

  def process_cabinet(cab_file, current, total)
    @results[:total] += 1
    rel_path = cab_file.sub(@input_dir + '/', '')
    print "[#{current}/#{total}] #{rel_path}... "

    begin
      cabinet = Cabriolet::CAB::Parser.new.parse(cab_file)

      # One output subdirectory per cabinet, named after the file sans extension.
      output_subdir = File.join(@output_dir, File.basename(cab_file, '.*'))
      FileUtils.mkdir_p(output_subdir)

      cabinet.files.each do |file|
        # CAB entries use backslash separators; convert them for the local filesystem.
        output_path = File.join(output_subdir, file.name.gsub('\\', '/'))
        FileUtils.mkdir_p(File.dirname(output_path))
        File.write(output_path, file.data, mode: 'wb')
      end

      @results[:success] << rel_path
      puts "OK (#{cabinet.files.count} files)"
    rescue => e
      @results[:failed] << { file: rel_path, error: e.message }
      puts "FAILED: #{e.message}"
    end
  end

  def print_summary
    puts "\n" + "=" * 60
    puts "BATCH PROCESSING SUMMARY"
    puts "=" * 60
    puts "Total:   #{@results[:total]}"
    puts "Success: #{@results[:success].count}"
    puts "Failed:  #{@results[:failed].count}"

    if @results[:failed].any?
      puts "\nFailed files:"
      @results[:failed].each do |f|
        puts "  - #{f[:file]}: #{f[:error]}"
      end
    end
  end
end

# Usage
processor = BatchProcessor.new(ARGV[0] || 'cabs', ARGV[1] || 'extracted')
processor.process_all
```
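Note that entry names come from the archive itself, so a malicious cabinet could use `..` components to write outside the output directory. A minimal guard, sketched with only the Ruby standard library (`safe_output_path` is a hypothetical helper, not part of Cabriolet):

```ruby
# Hypothetical helper: resolve an archive entry name to a path that is
# guaranteed to stay inside base_dir, raising if the entry tries to escape.
def safe_output_path(base_dir, entry_name)
  base      = File.expand_path(base_dir)
  candidate = File.expand_path(entry_name.gsub('\\', '/'), base)
  unless candidate.start_with?(base + File::SEPARATOR)
    raise SecurityError, "entry escapes output directory: #{entry_name}"
  end
  candidate
end

# Drop-in replacement for the File.join line inside process_cabinet:
#   output_path = safe_output_path(output_subdir, file.name)
```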
## Parallel Processing

```ruby
require 'cabriolet'
require 'fileutils'
require 'parallel'

class ParallelBatchProcessor
  def initialize(input_dir, output_dir, threads: 4)
    @input_dir = input_dir
    @output_dir = output_dir
    @threads = threads
  end

  def process_all
    cab_files = Dir.glob(File.join(@input_dir, '**', '*.cab'))
    puts "Processing #{cab_files.count} files with #{@threads} threads\n"

    results = Parallel.map(cab_files, in_threads: @threads) do |cab_file|
      process_one(cab_file)
    end

    print_results(results)
  end

  private

  # Extract one cabinet and return a result hash; never raises, so one bad
  # archive cannot abort the whole batch.
  def process_one(cab_file)
    cabinet = Cabriolet::CAB::Parser.new.parse(cab_file)
    output_dir = File.join(@output_dir, File.basename(cab_file, '.*'))
    FileUtils.mkdir_p(output_dir)

    cabinet.files.each do |file|
      output_path = File.join(output_dir, file.name.gsub('\\', '/'))
      FileUtils.mkdir_p(File.dirname(output_path))
      File.write(output_path, file.data, mode: 'wb')
    end

    { file: cab_file, status: :success, count: cabinet.files.count }
  rescue => e
    { file: cab_file, status: :failed, error: e.message }
  end

  def print_results(results)
    success = results.count { |r| r[:status] == :success }
    failed  = results.count { |r| r[:status] == :failed }
    puts "\nProcessed: #{results.count}"
    puts "Success: #{success}"
    puts "Failed: #{failed}"
  end
end

# Usage
processor = ParallelBatchProcessor.new('cabs', 'extracted', threads: 8)
processor.process_all
```
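Decompression is CPU-bound, so Ruby's global VM lock can limit how much thread-based parallelism actually helps. The parallel gem can fork worker processes instead; a sketch of the same map with `in_processes`, worth benchmarking against `in_threads` for your workload:

```ruby
# Each forked worker has its own VM lock, so CPU-bound decompression can use
# multiple cores. Results are marshalled back to the parent process, so return
# small status hashes rather than file data.
results = Parallel.map(cab_files, in_processes: 4) do |cab_file|
  process_one(cab_file)
end
```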
## Progress Reporting

```ruby
require 'cabriolet'
require 'fileutils'
require 'ruby-progressbar'

class ProgressBatchProcessor
  def initialize(input_dir, output_dir)
    @input_dir = input_dir
    @output_dir = output_dir
  end

  def process_all
    cab_files = Dir.glob(File.join(@input_dir, '**', '*.cab'))

    # One tick per cabinet; %e shows the estimated time remaining.
    progress = ProgressBar.create(
      title: "Extracting",
      total: cab_files.count,
      format: '%t: |%B| %p%% %e'
    )

    cab_files.each do |cab_file|
      process_cabinet(cab_file)
      progress.increment
    end
  end

  private

  def process_cabinet(cab_file)
    cabinet = Cabriolet::CAB::Parser.new.parse(cab_file)
    output_dir = File.join(@output_dir, File.basename(cab_file, '.*'))
    FileUtils.mkdir_p(output_dir)

    cabinet.files.each do |file|
      output_path = File.join(output_dir, file.name.gsub('\\', '/'))
      FileUtils.mkdir_p(File.dirname(output_path))
      File.write(output_path, file.data, mode: 'wb')
    end
  end
end

# Usage
processor = ProgressBatchProcessor.new('cabs', 'extracted')
processor.process_all
```
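One wrinkle: printing errors with `puts` while the bar is active garbles the terminal output. ruby-progressbar's `log` method clears the bar line, prints the message, and redraws; a sketch of the extraction loop with the rescue pattern from the earlier examples folded in:

```ruby
cab_files.each do |cab_file|
  begin
    process_cabinet(cab_file)
  rescue => e
    # ProgressBar#log prints without corrupting the bar's own line.
    progress.log "FAILED: #{cab_file}: #{e.message}"
  ensure
    progress.increment
  end
end
```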
## Error Recovery and Logging

```ruby
require 'cabriolet'
require 'fileutils'
require 'logger'

class RobustBatchProcessor
  def initialize(input_dir, output_dir, log_file: 'batch.log')
    @input_dir = input_dir
    @output_dir = output_dir
    @logger = Logger.new(log_file)
    @logger.level = Logger::INFO
  end

  def process_all
    cab_files = Dir.glob(File.join(@input_dir, '**', '*.cab'))
    @logger.info "Starting batch processing of #{cab_files.count} files"

    cab_files.each do |cab_file|
      process_with_retry(cab_file, max_retries: 3)
    end

    @logger.info "Batch processing complete"
  end

  private

  # Attempt extraction up to max_retries times, pausing between attempts.
  def process_with_retry(cab_file, max_retries: 3)
    retries = 0
    begin
      process_cabinet(cab_file)
      @logger.info "Success: #{cab_file}"
    rescue => e
      retries += 1
      if retries < max_retries
        @logger.warn "Retry #{retries}/#{max_retries} for #{cab_file}: #{e.message}"
        sleep 1
        retry
      else
        @logger.error "Failed after #{max_retries} attempts: #{cab_file} - #{e.message}"
        @logger.error e.backtrace.join("\n")
      end
    end
  end

  def process_cabinet(cab_file)
    cabinet = Cabriolet::CAB::Parser.new.parse(cab_file)
    output_dir = File.join(@output_dir, File.basename(cab_file, '.*'))
    FileUtils.mkdir_p(output_dir)

    cabinet.files.each do |file|
      output_path = File.join(output_dir, file.name.gsub('\\', '/'))
      FileUtils.mkdir_p(File.dirname(output_path))
      File.write(output_path, file.data, mode: 'wb')
    end
  end
end

# Usage
processor = RobustBatchProcessor.new('cabs', 'extracted', log_file: 'extraction.log')
processor.process_all
```
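Retrying blindly only pays off for transient failures such as a file held open by another process; a truncated or corrupt cabinet fails identically on every attempt. A sketch that gives up immediately on parse errors — `Cabriolet::Error` is an assumed class name here, so substitute whatever error hierarchy the library actually raises:

```ruby
def process_with_retry(cab_file, max_retries: 3)
  retries = 0
  begin
    process_cabinet(cab_file)
    @logger.info "Success: #{cab_file}"
  rescue Cabriolet::Error => e
    # Assumed parse-error class: a deterministic failure will not improve
    # on retry, so log it once and move on.
    @logger.error "Corrupt cabinet, not retrying: #{cab_file} - #{e.message}"
  rescue SystemCallError, IOError => e
    # Transient I/O problems (locked file, flaky network share) may clear up.
    retries += 1
    if retries < max_retries
      sleep 1
      retry
    else
      @logger.error "Failed after #{max_retries} attempts: #{cab_file} - #{e.message}"
    end
  end
end
```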