From 994594cd6809397470b3050780b58703297b84a4 Mon Sep 17 00:00:00 2001 From: Manny Dinssa <44172848+Dinssa@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:05:12 +0100 Subject: [PATCH] Clean source files (simplify) --- config/csv/definitions/definitions_clean.rb | 139 ------------------ .../lettings_support_download_23_24.csv | 0 .../lettings_support_download_24_25.csv | 0 .../lettings_user_download_23_24.csv | 0 .../lettings_user_download_24_25.csv | 0 .../sales_support_download_23_24.csv | 0 .../sales_support_download_24_25.csv | 0 .../sales_user_download_23_24.csv | 0 .../sales_user_download_24_25.csv | 0 9 files changed, 139 deletions(-) delete mode 100644 config/csv/definitions/definitions_clean.rb rename config/csv/definitions/{cleaned => }/lettings_support_download_23_24.csv (100%) rename config/csv/definitions/{cleaned => }/lettings_support_download_24_25.csv (100%) rename config/csv/definitions/{cleaned => }/lettings_user_download_23_24.csv (100%) rename config/csv/definitions/{cleaned => }/lettings_user_download_24_25.csv (100%) rename config/csv/definitions/{cleaned => }/sales_support_download_23_24.csv (100%) rename config/csv/definitions/{cleaned => }/sales_support_download_24_25.csv (100%) rename config/csv/definitions/{cleaned => }/sales_user_download_23_24.csv (100%) rename config/csv/definitions/{cleaned => }/sales_user_download_24_25.csv (100%) diff --git a/config/csv/definitions/definitions_clean.rb b/config/csv/definitions/definitions_clean.rb deleted file mode 100644 index ab372bb9b..000000000 --- a/config/csv/definitions/definitions_clean.rb +++ /dev/null @@ -1,139 +0,0 @@ -require "csv" -require "tempfile" -require "fileutils" - -DIRECTORY = "config/csv/definitions" -ORIGINAL_DIRECTORY = File.join(DIRECTORY, "original") -FORMATTED_DIRECTORY = File.join(DIRECTORY, "formatted") -CLEANED_DIRECTORY = File.join(DIRECTORY, "cleaned") -DELIMITER = "," -REPLACEMENT_CHAR = ";" - -# Common errors that are found in the CSV files have the invalid character '�' replaced with the correct character or removed -COMMON_ERRORS = { - "tenant�s" => "'", - "Don�t" => "'", - "household�s" => "'", - "buyer�s" => "'", - "3�s" => "'", - "4�s" => "'", - "5�s" => "'", - "6�s" => "'", - "a �local" => "", - "postcode�" => "", - "confirmed: �You" => "", - "given �reasonable" => "", - "preference� by" => "", - "confirmed: �Are" => "", - "postcode�,,," => "", - "expect.�" => "", - "is %{age}.�" => "", - "agreement�?" => "", -} -def clean_csv_file(file) - original_path = File.join(ORIGINAL_DIRECTORY, file) - content = File.read(original_path).encode("UTF-8", invalid: :replace, undef: :replace, replace: '�') - - updated_content = "" - content.each_line do |line| - # Replace common errors - if line.include?("�") - COMMON_ERRORS.each do |error, correction| - error_regex = Regexp.new(error.gsub('�', '\�')) - if line =~ error_regex - line.gsub!('�', correction) - end - end - end - - # Replace uncommon errors with user input - if line.include?("�") - puts "Line with unrecognised symbol: #{line}" - line.chars.each_with_index do |char, index| - if char == "�" - first_before_space = line.rindex(' ', index - 1) || 0 - before_space = line.rindex(' ', first_before_space - 1) || first_before_space - first_after_space = line.index(' ', index + 1) || line.length - after_space = line.index(' ', first_after_space + 1) || line.length - before_word = line[before_space...index].strip - after_word = line[index+1..after_space].strip - - context = if index == line.length - 1 - "#{before_word}�" - else - "#{before_word} �#{after_word}" - end - - puts "Context: '#{context}'. Choose a replacement for '�':" - puts "1. Blank (just remove)" - puts "2. ' (single quote)" - puts "3. Space" - puts "Type your choice or enter a replacement:" - choice = STDIN.gets.chomp - replacement = case choice - when "1" - "" - when "2" - "'" - when "3" - " " - else - choice - end - line[index] = replacement - end - end - end - - # if line.count(",") > 1 - # first_comma_index = line.index(",") - # line = line[0..first_comma_index] + line[(first_comma_index + 1)..-1].gsub(",", REPLACEMENT_CHAR) - # end - - updated_content << line - end - - - temp_file = Tempfile.new - temp_file.write(updated_content) - temp_file.close - - temp_file.path -end - -FileUtils.mkdir_p(FORMATTED_DIRECTORY) -FileUtils.mkdir_p(CLEANED_DIRECTORY) - -# filenames = %w[lettings_support_download_23_24.csv lettings_support_download_24_25.csv lettings_user_download_23_24.csv lettings_user_download_24_25.csv sales_support_download_23_24.csv sales_support_download_24_25.csv sales_user_download_23_24.csv sales_user_download_24_25.csv] -filenames = Dir.entries(ORIGINAL_DIRECTORY).select { |f| File.extname(f) == ".csv" } - -filenames.each do |filename| - cleaned_file_path = clean_csv_file(filename) - - first_values = [] - second_values = [] - - CSV.foreach(cleaned_file_path) do |row| - first_values << row[0] unless row[0].nil? - second_values << row[1] unless row[1].nil? - end - - puts "File #{filename} has been cleaned" - puts "First values (count): #{first_values.count}" - puts "Second values (count): #{second_values.count}" - - output_csv_path = File.join(FORMATTED_DIRECTORY, filename.to_s) - cleaned_output_path = File.join(CLEANED_DIRECTORY, filename.to_s) - - first_line = first_values.join(DELIMITER) - second_line = second_values.join(DELIMITER) - - File.open(output_csv_path, "w") do |file| - file.puts first_line - file.puts second_line - end - - FileUtils.cp(cleaned_file_path, cleaned_output_path) - - puts "CSV has been outputted to #{output_csv_path} and cleaned file has been copied to #{cleaned_output_path}" -end diff --git a/config/csv/definitions/cleaned/lettings_support_download_23_24.csv b/config/csv/definitions/lettings_support_download_23_24.csv similarity index 100% rename from config/csv/definitions/cleaned/lettings_support_download_23_24.csv rename to config/csv/definitions/lettings_support_download_23_24.csv diff --git a/config/csv/definitions/cleaned/lettings_support_download_24_25.csv b/config/csv/definitions/lettings_support_download_24_25.csv similarity index 100% rename from config/csv/definitions/cleaned/lettings_support_download_24_25.csv rename to config/csv/definitions/lettings_support_download_24_25.csv diff --git a/config/csv/definitions/cleaned/lettings_user_download_23_24.csv b/config/csv/definitions/lettings_user_download_23_24.csv similarity index 100% rename from config/csv/definitions/cleaned/lettings_user_download_23_24.csv rename to config/csv/definitions/lettings_user_download_23_24.csv diff --git a/config/csv/definitions/cleaned/lettings_user_download_24_25.csv b/config/csv/definitions/lettings_user_download_24_25.csv similarity index 100% rename from config/csv/definitions/cleaned/lettings_user_download_24_25.csv rename to config/csv/definitions/lettings_user_download_24_25.csv diff --git a/config/csv/definitions/cleaned/sales_support_download_23_24.csv b/config/csv/definitions/sales_support_download_23_24.csv similarity index 100% rename from config/csv/definitions/cleaned/sales_support_download_23_24.csv rename to config/csv/definitions/sales_support_download_23_24.csv diff --git a/config/csv/definitions/cleaned/sales_support_download_24_25.csv b/config/csv/definitions/sales_support_download_24_25.csv similarity index 100% rename from config/csv/definitions/cleaned/sales_support_download_24_25.csv rename to config/csv/definitions/sales_support_download_24_25.csv diff --git a/config/csv/definitions/cleaned/sales_user_download_23_24.csv b/config/csv/definitions/sales_user_download_23_24.csv similarity index 100% rename from config/csv/definitions/cleaned/sales_user_download_23_24.csv rename to config/csv/definitions/sales_user_download_23_24.csv diff --git a/config/csv/definitions/cleaned/sales_user_download_24_25.csv b/config/csv/definitions/sales_user_download_24_25.csv similarity index 100% rename from config/csv/definitions/cleaned/sales_user_download_24_25.csv rename to config/csv/definitions/sales_user_download_24_25.csv