|
|
|
|
@ -1,139 +0,0 @@
|
|
|
|
|
require "csv" |
|
|
|
|
require "tempfile" |
|
|
|
|
require "fileutils" |
|
|
|
|
|
|
|
|
|
DIRECTORY = "config/csv/definitions" |
|
|
|
|
ORIGINAL_DIRECTORY = File.join(DIRECTORY, "original") |
|
|
|
|
FORMATTED_DIRECTORY = File.join(DIRECTORY, "formatted") |
|
|
|
|
CLEANED_DIRECTORY = File.join(DIRECTORY, "cleaned") |
|
|
|
|
DELIMITER = "," |
|
|
|
|
REPLACEMENT_CHAR = ";" |
|
|
|
|
|
|
|
|
|
# Common errors that are found in the CSV files have the invalid character '<EFBFBD>' replaced with the correct character or removed |
|
|
|
|
COMMON_ERRORS = { |
|
|
|
|
"tenant<EFBFBD>s" => "'", |
|
|
|
|
"Don<EFBFBD>t" => "'", |
|
|
|
|
"household<EFBFBD>s" => "'", |
|
|
|
|
"buyer<EFBFBD>s" => "'", |
|
|
|
|
"3<EFBFBD>s" => "'", |
|
|
|
|
"4<EFBFBD>s" => "'", |
|
|
|
|
"5<EFBFBD>s" => "'", |
|
|
|
|
"6<EFBFBD>s" => "'", |
|
|
|
|
"a <EFBFBD>local" => "", |
|
|
|
|
"postcode<EFBFBD>" => "", |
|
|
|
|
"confirmed: <EFBFBD>You" => "", |
|
|
|
|
"given <EFBFBD>reasonable" => "", |
|
|
|
|
"preference<EFBFBD> by" => "", |
|
|
|
|
"confirmed: <EFBFBD>Are" => "", |
|
|
|
|
"postcode<EFBFBD>,,," => "", |
|
|
|
|
"expect.<EFBFBD>" => "", |
|
|
|
|
"is %{age}.<EFBFBD>" => "", |
|
|
|
|
"agreement<EFBFBD>?" => "", |
|
|
|
|
} |
|
|
|
|
def clean_csv_file(file) |
|
|
|
|
original_path = File.join(ORIGINAL_DIRECTORY, file) |
|
|
|
|
content = File.read(original_path).encode("UTF-8", invalid: :replace, undef: :replace, replace: '<EFBFBD>') |
|
|
|
|
|
|
|
|
|
updated_content = "" |
|
|
|
|
content.each_line do |line| |
|
|
|
|
# Replace common errors |
|
|
|
|
if line.include?("<EFBFBD>") |
|
|
|
|
COMMON_ERRORS.each do |error, correction| |
|
|
|
|
error_regex = Regexp.new(error.gsub('<EFBFBD>', '\<EFBFBD>')) |
|
|
|
|
if line =~ error_regex |
|
|
|
|
line.gsub!('<EFBFBD>', correction) |
|
|
|
|
end |
|
|
|
|
end |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
# Replace uncommon errors with user input |
|
|
|
|
if line.include?("<EFBFBD>") |
|
|
|
|
puts "Line with unrecognised symbol: #{line}" |
|
|
|
|
line.chars.each_with_index do |char, index| |
|
|
|
|
if char == "<EFBFBD>" |
|
|
|
|
first_before_space = line.rindex(' ', index - 1) || 0 |
|
|
|
|
before_space = line.rindex(' ', first_before_space - 1) || first_before_space |
|
|
|
|
first_after_space = line.index(' ', index + 1) || line.length |
|
|
|
|
after_space = line.index(' ', first_after_space + 1) || line.length |
|
|
|
|
before_word = line[before_space...index].strip |
|
|
|
|
after_word = line[index+1..after_space].strip |
|
|
|
|
|
|
|
|
|
context = if index == line.length - 1 |
|
|
|
|
"#{before_word}<EFBFBD>" |
|
|
|
|
else |
|
|
|
|
"#{before_word} <EFBFBD>#{after_word}" |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
puts "Context: '#{context}'. Choose a replacement for '<EFBFBD>':" |
|
|
|
|
puts "1. Blank (just remove)" |
|
|
|
|
puts "2. ' (single quote)" |
|
|
|
|
puts "3. Space" |
|
|
|
|
puts "Type your choice or enter a replacement:" |
|
|
|
|
choice = STDIN.gets.chomp |
|
|
|
|
replacement = case choice |
|
|
|
|
when "1" |
|
|
|
|
"" |
|
|
|
|
when "2" |
|
|
|
|
"'" |
|
|
|
|
when "3" |
|
|
|
|
" " |
|
|
|
|
else |
|
|
|
|
choice |
|
|
|
|
end |
|
|
|
|
line[index] = replacement |
|
|
|
|
end |
|
|
|
|
end |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
# if line.count(",") > 1 |
|
|
|
|
# first_comma_index = line.index(",") |
|
|
|
|
# line = line[0..first_comma_index] + line[(first_comma_index + 1)..-1].gsub(",", REPLACEMENT_CHAR) |
|
|
|
|
# end |
|
|
|
|
|
|
|
|
|
updated_content << line |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
temp_file = Tempfile.new |
|
|
|
|
temp_file.write(updated_content) |
|
|
|
|
temp_file.close |
|
|
|
|
|
|
|
|
|
temp_file.path |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
FileUtils.mkdir_p(FORMATTED_DIRECTORY) |
|
|
|
|
FileUtils.mkdir_p(CLEANED_DIRECTORY) |
|
|
|
|
|
|
|
|
|
# filenames = %w[lettings_support_download_23_24.csv lettings_support_download_24_25.csv lettings_user_download_23_24.csv lettings_user_download_24_25.csv sales_support_download_23_24.csv sales_support_download_24_25.csv sales_user_download_23_24.csv sales_user_download_24_25.csv] |
|
|
|
|
filenames = Dir.entries(ORIGINAL_DIRECTORY).select { |f| File.extname(f) == ".csv" } |
|
|
|
|
|
|
|
|
|
filenames.each do |filename| |
|
|
|
|
cleaned_file_path = clean_csv_file(filename) |
|
|
|
|
|
|
|
|
|
first_values = [] |
|
|
|
|
second_values = [] |
|
|
|
|
|
|
|
|
|
CSV.foreach(cleaned_file_path) do |row| |
|
|
|
|
first_values << row[0] unless row[0].nil? |
|
|
|
|
second_values << row[1] unless row[1].nil? |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
puts "File #{filename} has been cleaned" |
|
|
|
|
puts "First values (count): #{first_values.count}" |
|
|
|
|
puts "Second values (count): #{second_values.count}" |
|
|
|
|
|
|
|
|
|
output_csv_path = File.join(FORMATTED_DIRECTORY, filename.to_s) |
|
|
|
|
cleaned_output_path = File.join(CLEANED_DIRECTORY, filename.to_s) |
|
|
|
|
|
|
|
|
|
first_line = first_values.join(DELIMITER) |
|
|
|
|
second_line = second_values.join(DELIMITER) |
|
|
|
|
|
|
|
|
|
File.open(output_csv_path, "w") do |file| |
|
|
|
|
file.puts first_line |
|
|
|
|
file.puts second_line |
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
FileUtils.cp(cleaned_file_path, cleaned_output_path) |
|
|
|
|
|
|
|
|
|
puts "CSV has been outputted to #{output_csv_path} and cleaned file has been copied to #{cleaned_output_path}" |
|
|
|
|
end |