Browse Source

Clean source files (simplify)

pull/2539/head
Manny Dinssa 2 years ago
parent
commit
994594cd68
  1. 139
      config/csv/definitions/definitions_clean.rb
  2. 0
      config/csv/definitions/lettings_support_download_23_24.csv
  3. 0
      config/csv/definitions/lettings_support_download_24_25.csv
  4. 0
      config/csv/definitions/lettings_user_download_23_24.csv
  5. 0
      config/csv/definitions/lettings_user_download_24_25.csv
  6. 0
      config/csv/definitions/sales_support_download_23_24.csv
  7. 0
      config/csv/definitions/sales_support_download_24_25.csv
  8. 0
      config/csv/definitions/sales_user_download_23_24.csv
  9. 0
      config/csv/definitions/sales_user_download_24_25.csv

139
config/csv/definitions/definitions_clean.rb

@ -1,139 +0,0 @@
require "csv"
require "tempfile"
require "fileutils"
# Directory layout: the source CSVs live in "original"; the script writes its
# results into "formatted" and "cleaned" next to it.
DIRECTORY = "config/csv/definitions".freeze
ORIGINAL_DIRECTORY = File.join(DIRECTORY, "original").freeze
FORMATTED_DIRECTORY = File.join(DIRECTORY, "formatted").freeze
CLEANED_DIRECTORY = File.join(DIRECTORY, "cleaned").freeze

# Separator used when writing the formatted output rows.
DELIMITER = ",".freeze
# Only referenced by the commented-out comma-replacement code in clean_csv_file.
REPLACEMENT_CHAR = ";".freeze

# Common errors that are found in the CSV files have the invalid character '<EFBFBD>' replaced with the correct character or removed.
# Each key is a phrase containing the invalid character; each value is the text
# that takes the invalid character's place (an apostrophe, or nothing at all).
COMMON_ERRORS = {
  "tenant<EFBFBD>s" => "'",
  "Don<EFBFBD>t" => "'",
  "household<EFBFBD>s" => "'",
  "buyer<EFBFBD>s" => "'",
  "3<EFBFBD>s" => "'",
  "4<EFBFBD>s" => "'",
  "5<EFBFBD>s" => "'",
  "6<EFBFBD>s" => "'",
  "a <EFBFBD>local" => "",
  "postcode<EFBFBD>" => "",
  "confirmed: <EFBFBD>You" => "",
  "given <EFBFBD>reasonable" => "",
  "preference<EFBFBD> by" => "",
  "confirmed: <EFBFBD>Are" => "",
  "postcode<EFBFBD>,,," => "",
  "expect.<EFBFBD>" => "",
  "is %{age}.<EFBFBD>" => "",
  "agreement<EFBFBD>?" => "",
}.freeze
# Reads +file+ from ORIGINAL_DIRECTORY, repairs every invalid-character marker
# in it and writes the result to a new temporary file.
#
# Markers that are part of a phrase listed in COMMON_ERRORS are fixed
# automatically; for any marker left over the user is asked for a replacement
# on standard input.
#
# The temporary file is NOT removed automatically (a Tempfile object would
# delete it as soon as it is garbage collected, possibly before the caller has
# read it), so the caller should delete the returned path when finished.
#
# @param file [String] name of a file inside ORIGINAL_DIRECTORY
# @return [String] path of the temporary file holding the cleaned content
# @raise [EOFError] if standard input ends while a replacement is being requested
def clean_csv_file(file)
  marker = "<EFBFBD>"
  original_path = File.join(ORIGINAL_DIRECTORY, file)
  content = File.read(original_path).encode("UTF-8", invalid: :replace, undef: :replace, replace: marker)

  cleaned_lines = content.each_line.map do |line|
    fixed = apply_common_corrections(line, marker)
    fixed = prompt_for_corrections(fixed, marker) if fixed.include?(marker)
    # if line.count(",") > 1
    # first_comma_index = line.index(",")
    # line = line[0..first_comma_index] + line[(first_comma_index + 1)..-1].gsub(",", REPLACEMENT_CHAR)
    # end
    fixed
  end

  temp_file = Tempfile.create(["cleaned_", File.extname(file)])
  begin
    temp_file.write(cleaned_lines.join)
  ensure
    temp_file.close
  end
  temp_file.path
end

# Returns a copy of +line+ in which every phrase from COMMON_ERRORS has its
# marker swapped for the configured correction. Phrases are matched literally,
# and markers outside a known phrase are left untouched.
def apply_common_corrections(line, marker)
  COMMON_ERRORS.reduce(line) do |fixed, (error, correction)|
    corrected_phrase = error.sub(marker) { correction }
    fixed.gsub(error) { corrected_phrase }
  end
end

# Asks the user for a replacement for each marker still present in +line+ and
# returns the corrected copy.
def prompt_for_corrections(line, marker)
  puts "Line with unrecognised symbol: #{line}"
  fixed = line.dup
  search_from = 0
  # Search the string being edited (not a snapshot), so positions stay valid
  # after a replacement that is shorter or longer than the marker.
  while (index = fixed.index(marker, search_from))
    puts "Context: '#{marker_context(fixed, index, marker)}'. Choose a replacement for '#{marker}':"
    puts "1. Blank (just remove)"
    puts "2. ' (single quote)"
    puts "3. Space"
    puts "Type your choice or enter a replacement:"
    replacement = read_replacement_choice
    fixed[index, marker.length] = replacement
    # Resume after the inserted text so a replacement is never re-examined.
    search_from = index + replacement.length
  end
  fixed
end

# Returns the marker at +index+ together with the surrounding text (up to two
# space-separated words on each side), exactly as it appears in +line+.
def marker_context(line, index, marker)
  # Guard the zero positions: a negative start would make rindex search from
  # the end of the line instead of backwards from the marker.
  near_start = index.zero? ? 0 : (line.rindex(" ", index - 1) || 0)
  start = near_start.zero? ? 0 : (line.rindex(" ", near_start - 1) || 0)
  near_finish = line.index(" ", index + marker.length) || line.length
  finish = line.index(" ", near_finish + 1) || line.length
  line[start...finish].strip
end

# Reads one line from standard input and translates the menu choices
# ("1", "2", "3") into their replacement text; anything else is used verbatim.
def read_replacement_choice
  input = $stdin.gets
  raise EOFError, "standard input closed while waiting for a replacement choice" if input.nil?

  choice = input.chomp
  { "1" => "", "2" => "'", "3" => " " }.fetch(choice, choice)
end
# Driver: cleans every CSV found in ORIGINAL_DIRECTORY, then writes
#   * FORMATTED_DIRECTORY/<name> - two rows: all non-empty first-column values,
#     then all non-empty second-column values of the cleaned file;
#   * CLEANED_DIRECTORY/<name>   - a copy of the cleaned file itself.
FileUtils.mkdir_p(FORMATTED_DIRECTORY)
FileUtils.mkdir_p(CLEANED_DIRECTORY)

# filenames = %w[lettings_support_download_23_24.csv lettings_support_download_24_25.csv lettings_user_download_23_24.csv lettings_user_download_24_25.csv sales_support_download_23_24.csv sales_support_download_24_25.csv sales_user_download_23_24.csv sales_user_download_24_25.csv]
# Sorted so files are processed in the same order on every run
# (Dir.entries makes no ordering guarantee).
filenames = Dir.entries(ORIGINAL_DIRECTORY).select { |f| File.extname(f) == ".csv" }.sort

filenames.each do |filename|
  cleaned_file_path = clean_csv_file(filename)

  begin
    first_values = []
    second_values = []
    CSV.foreach(cleaned_file_path) do |row|
      first_values << row[0] unless row[0].nil?
      second_values << row[1] unless row[1].nil?
    end

    puts "File #{filename} has been cleaned"
    puts "First values (count): #{first_values.count}"
    puts "Second values (count): #{second_values.count}"

    output_csv_path = File.join(FORMATTED_DIRECTORY, filename)
    cleaned_output_path = File.join(CLEANED_DIRECTORY, filename)

    # Let the CSV library build the rows so values containing the delimiter,
    # quotes or line breaks are quoted instead of corrupting the column layout.
    File.open(output_csv_path, "w") do |file|
      file.write CSV.generate_line(first_values, col_sep: DELIMITER)
      file.write CSV.generate_line(second_values, col_sep: DELIMITER)
    end

    FileUtils.cp(cleaned_file_path, cleaned_output_path)
    puts "CSV has been outputted to #{output_csv_path} and cleaned file has been copied to #{cleaned_output_path}"
  ensure
    # The cleaned content has been copied (or processing failed); either way
    # the temporary file is no longer needed.
    FileUtils.rm_f(cleaned_file_path)
  end
end

0
config/csv/definitions/cleaned/lettings_support_download_23_24.csv → config/csv/definitions/lettings_support_download_23_24.csv

Can't render this file because it has a wrong number of fields in line 20.

0
config/csv/definitions/cleaned/lettings_support_download_24_25.csv → config/csv/definitions/lettings_support_download_24_25.csv

Can't render this file because it has a wrong number of fields in line 25.

0
config/csv/definitions/cleaned/lettings_user_download_23_24.csv → config/csv/definitions/lettings_user_download_23_24.csv

Can't render this file because it has a wrong number of fields in line 17.

0
config/csv/definitions/cleaned/lettings_user_download_24_25.csv → config/csv/definitions/lettings_user_download_24_25.csv

Can't render this file because it has a wrong number of fields in line 18.

0
config/csv/definitions/cleaned/sales_support_download_23_24.csv → config/csv/definitions/sales_support_download_23_24.csv

Can't render this file because it has a wrong number of fields in line 20.

0
config/csv/definitions/cleaned/sales_support_download_24_25.csv → config/csv/definitions/sales_support_download_24_25.csv

Can't render this file because it has a wrong number of fields in line 20.

0
config/csv/definitions/cleaned/sales_user_download_23_24.csv → config/csv/definitions/sales_user_download_23_24.csv

Can't render this file because it has a wrong number of fields in line 19.

0
config/csv/definitions/cleaned/sales_user_download_24_25.csv → config/csv/definitions/sales_user_download_24_25.csv

Can't render this file because it has a wrong number of fields in line 19.
Loading…
Cancel
Save