Solved Removing duplicate text in a txt file using bat

November 30, 2018 at 19:39:49
Specs: Windows 7
Dear all! I have this script for removing duplicates from a large text file. It works as a .bat file: when I drop a text file onto the .bat file, it removes the duplicates quickly and produces two output files, one without duplicates and a second containing only the duplicates. The only problem is that it sorts the output alphabetically. I need to stop it from sorting the output: the lines in the output files should stay in the same order as in the original text file. No sorting. Thank you for your help.
Sincerely

/*
@echo off && cls
set WinDirNet=%WinDir%\Microsoft.NET\Framework
IF EXIST "%WinDirNet%\v2.0.50727\csc.exe" set csc="%WinDirNet%\v2.0.50727\csc.exe"
IF EXIST "%WinDirNet%\v3.5\csc.exe" set csc="%WinDirNet%\v3.5\csc.exe"
IF EXIST "%WinDirNet%\v4.0.30319\csc.exe" set csc="%WinDirNet%\v4.0.30319\csc.exe"
%csc% /nologo /out:"%~0.exe" %0
"%~0.exe" %1
del "%~0.exe"
exit
*/

//14 mar 2015 @ 13:29
//metaspamer.blogspot.com

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace spicemustflow_sortcleaner
{
    /// <summary>
    /// Splits each input text file into a "uniques" file and a "duplicates" file.
    /// Duplicates are found by sorting the lines and comparing neighbours, so both
    /// output files are in sorted order, not in the order of the input file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds an output file name of the form "{name}_uniques.txt" or
        /// "{name}_duplicates.txt", appending 2, 3, ... until the name is free.
        /// </summary>
        /// <param name="nameWithoutExtension">
        /// Base name of the output file. It may include a directory prefix; the
        /// existence check is done on exactly the name that is returned.
        /// </param>
        /// <param name="uniq">true for the "uniques" file, false for the "duplicates" file.</param>
        /// <returns>The first candidate name for which no file exists yet.</returns>
        static string GetFilename(string nameWithoutExtension, bool uniq)
        {
            string kind = uniq ? "uniques" : "duplicates";
            for (int n = 1; ; n++)
            {
                // The first candidate has no numeric suffix; later ones get 2, 3, ...
                string outputFile = string.Format("{0}_{1}{2}.txt",
                    nameWithoutExtension, kind, n > 1 ? n.ToString() : string.Empty);
                if (!File.Exists(outputFile))
                {
                    return outputFile;
                }
            }
        }

        /// <summary>
        /// Picks the text encoding for a file from its first three bytes.
        /// </summary>
        /// <param name="filename">Path of the file to inspect.</param>
        /// <returns>
        /// UTF-8 when the file starts with the bytes EF BB BF, the system default
        /// encoding when the file is empty, and code page 1251 otherwise.
        /// </returns>
        static Encoding DetectEncoding(string filename)
        {
            byte[] head = new byte[3];
            int bytesRead;
            using (FileStream fs = File.OpenRead(filename))
            {
                bytesRead = fs.Read(head, 0, head.Length);
            }

            if (bytesRead == 0)
            {
                return Encoding.Default;
            }
            if (bytesRead == 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
            {
                return Encoding.UTF8;
            }
            return Encoding.GetEncoding(1251);
        }

        /// <summary>
        /// Processes every file named in <paramref name="args"/>, or every *.txt file
        /// next to the executable when no arguments are given. Output files are
        /// written to the executable's directory.
        /// </summary>
        /// <param name="args">Paths of the text files to process.</param>
        static void Main(string[] args)
        {
            string path = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) + "\\";
            string[] filelist = args.Length == 0
                ? Directory.GetFiles(path, "*.txt", SearchOption.TopDirectoryOnly)
                : args;

            foreach (string filename in filelist)
            {
                Encoding enc = DetectEncoding(filename);

                Console.Write("loading {0}..\n", filename);
                string[] input = File.ReadAllLines(filename, enc);
                if (input.Length == 0)
                {
                    continue;
                }

                Console.Write("sorting..\n");
                // Sort with the same (ordinal) notion of equality that the comparison
                // below uses. The default culture-sensitive sort can place lines that
                // differ ordinally but compare equal culturally between two identical
                // lines, which would hide real duplicates from the neighbour check.
                Array.Sort(input, StringComparer.Ordinal);

                List<string> uniques = new List<string>();
                List<string> duplicates = new List<string>();
                Console.Write("deleting duplicates..\n");
                uniques.Add(input[0]);
                for (int i = 1; i < input.Length; i++)
                {
                    // After sorting, a line is a duplicate exactly when it equals its predecessor.
                    if (input[i] != input[i - 1])
                    {
                        uniques.Add(input[i]);
                    }
                    else
                    {
                        duplicates.Add(input[i]);
                    }
                }

                Console.Write("\nsaving..\n");
                // Hand GetFilename the full output path so that its "already exists"
                // check looks in the directory the files are actually written to,
                // not in the current working directory.
                string outputBase = Path.Combine(path, Path.GetFileNameWithoutExtension(filename));
                File.WriteAllLines(GetFilename(outputBase, true), uniques.ToArray(), enc);
                File.WriteAllLines(GetFilename(outputBase, false), duplicates.ToArray(), enc);
                Console.Write("\n\n");
            }
        }
    }
}


See More: Removing duplicate text in a txt file using bat

Reply ↓  Report •

✔ Best Answer
December 1, 2018 at 16:43:23
The problem is the sorting isn't a feature of the program; it's how the duplicates are found. The program reads everything, sorts it, and then goes down through the lines, checking to see if the previous line is the same as the current line. To get rid of the sorting, you'd have to come up with a different method for deduplication. So I did. I didn't bother with the file holding the duplicates, since you didn't seem that interested in it.
/*
@echo off && cls
set WinDirNet=%WinDir%\Microsoft.NET\Framework
IF EXIST "%WinDirNet%\v2.0.50727\csc.exe" set csc="%WinDirNet%\v2.0.50727\csc.exe"
IF EXIST "%WinDirNet%\v3.5\csc.exe" set csc="%WinDirNet%\v3.5\csc.exe"
IF EXIST "%WinDirNet%\v4.0.30319\csc.exe" set csc="%WinDirNet%\v4.0.30319\csc.exe"
%csc% /nologo /out:"%~0.exe" %0
"%~0.exe" %1
del "%~0.exe"
exit /b
*/

//14 mar 2015 @ 13:29
//metaspamer.blogspot.com

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace spicemustflow_sortcleaner {
  /// <summary>
  /// Writes a copy of each input text file with repeated lines removed. The first
  /// occurrence of every line is kept and the original line order is preserved.
  /// </summary>
  class Program {
    /// <summary>
    /// Builds an output file name of the form "{name}_uniques.txt", appending
    /// 2, 3, ... until the name is free.
    /// </summary>
    /// <param name="nameWithoutExtension">
    /// Base name of the output file. It may include a directory prefix; the
    /// existence check is done on exactly the name that is returned.
    /// </param>
    /// <returns>The first candidate name for which no file exists yet.</returns>
    static string GetFilename(string nameWithoutExtension) {
      for (int n = 1; ; n++) {
        // The first candidate has no numeric suffix; later ones get 2, 3, ...
        string outputFile = string.Format("{0}_uniques{1}.txt",
          nameWithoutExtension, n > 1 ? n.ToString() : string.Empty);
        if (!File.Exists(outputFile)) return outputFile;
      }
    }

    /// <summary>
    /// Picks the text encoding for a file from its first three bytes.
    /// </summary>
    /// <param name="filename">Path of the file to inspect.</param>
    /// <returns>
    /// UTF-8 when the file starts with the bytes EF BB BF, the system default
    /// encoding when the file is empty, and code page 1251 otherwise.
    /// </returns>
    static Encoding DetectEncoding(string filename) {
      byte[] head = new byte[3];
      int bytesRead;
      using (FileStream fs = File.OpenRead(filename)) {
        bytesRead = fs.Read(head, 0, head.Length);
      }

      if (bytesRead == 0) return Encoding.Default;
      if (bytesRead == 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
        return Encoding.UTF8;
      return Encoding.GetEncoding(1251);
    }

    /// <summary>
    /// Processes every file named in <paramref name="args"/>, or every *.txt file
    /// next to the executable when no arguments are given. Output files are
    /// written to the executable's directory.
    /// </summary>
    /// <param name="args">Paths of the text files to process.</param>
    static void Main(string[] args) {
      string path = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) + "\\";
      string[] filelist = args.Length == 0
        ? Directory.GetFiles(path, "*.txt", SearchOption.TopDirectoryOnly)
        : args;

      foreach (string filename in filelist) {
        Encoding enc = DetectEncoding(filename);

        // Hand GetFilename the full output path so that its "already exists"
        // check looks in the directory the file is actually written to, not in
        // the current working directory.
        string outputFile = GetFilename(
          Path.Combine(path, Path.GetFileNameWithoutExtension(filename)));

        using (StreamReader inFile = new StreamReader(filename, enc))
        using (StreamWriter outFile = new StreamWriter(outputFile, false, enc)) {
          Console.WriteLine("Deduping {0}..", filename);
          HashSet<string> prevLines = new HashSet<string>();
          string line;
          while ((line = inFile.ReadLine()) != null) {
            // Add returns false when the line has been seen before, so only
            // the first occurrence of each line is written.
            if (prevLines.Add(line))
              outFile.WriteLine(line);
          }
        }
      }
    }
  }
}

How To Ask Questions The Smart Way



#1
December 1, 2018 at 08:21:22
I can try, but it will take me quite some time.

i5-6600K[delid]@4.8GHz/4.4GHz@1.424v LLC=6 | 2x4GB Crucial-DDR4-2133CL15@14-14-14-30 1T 2800MHz@1.37v
MSI Armor RX 570 4GB@1415Mhz core@1.2v/1920MHz


Reply ↓  Report •

#2
December 1, 2018 at 09:36:39
Eh, yeah, I ran into some issues; I might try again on Monday.

this is not working yet:

@echo off&setlocal enabledelayedexpansion
:: Work-in-progress attempt (the poster reports it is not working yet):
:: take the text file passed as the first argument and append selected lines to output.txt.
set "file=%~1"
set outfile=output.txt
:MAIN
:: Feed every line of the input file to CHECK, one CALL per line.
:: NOTE(review): CALL parses its arguments a second time, which is a likely reason
:: why lines containing a percent sign get lost - confirm.
for /f "tokens=*" %%a in ('type "!file!" ^| find /v ""') do call :CHECK "%%a"
exit/b
:CHECK
:: Count how many lines of the input file are exactly equal to the line passed in.
set duplicates=0
set "line=%~1"
:: FIND pre-selects the lines that contain the text; the IF then keeps only exact matches.
for /f "tokens=*" %%a in ('type "!file!" ^| find "%~1"') do if "%%a"=="!line!" set/a duplicates+=1
:: Append the line to the output file only when it was counted exactly once.
if !duplicates!==1 (echo !line!>>!outfile!)
:: Observed by the poster: this echoes all lines plus the duplicates, but drops lines that contain a percent sign, for some reason
exit/b
:KILLME
:: Nothing in the visible script jumps to or calls this label; it appears to hold
:: alternative versions of the output test, each followed by the result the poster observed.
if !duplicates!==0 (echo !line!>>!outfile!)
:: Observed: echoes every line that has a percent sign in it
if !duplicates! LEQ 1 (echo !line!>>!outfile!)
:: Observed: prints everything except the duplicates - it removes both the original line and its duplicate - and also removes all lines with a percent sign in them
if !duplicates! GEQ 2 (echo !line!>>!outfile!)
:: Observed: echoes all the duplicates

i5-6600K[delid]@4.8GHz/4.4GHz@1.424v LLC=6 | 2x4GB Crucial-DDR4-2133CL15@14-14-14-30 1T 2800MHz@1.37v
MSI Armor RX 570 4GB@1415Mhz core@1.2v/1920MHz


Reply ↓  Report •

#3
December 1, 2018 at 10:24:30
Dear hidde663, thank you for your help! I don't know anything about scripting; I just use this script to delete duplicates in my email data. It works fine. The only problem is that it sorts the data alphabetically in the output file. I need the output data saved in the same order as in the original. Is there a way to remove or replace the part of the script that does the alphabetical sorting? I am sorry if I have said anything wrong about the changes.
Sincerely.

Reply ↓  Report •

Related Solutions

#4
December 1, 2018 at 16:43:23
✔ Best Answer
The problem is the sorting isn't a feature of the program; it's how the duplicates are found. The program reads everything, sorts it, and then goes down through the lines, checking to see if the previous line is the same as the current line. To get rid of the sorting, you'd have to come up with a different method for deduplication. So I did. I didn't bother with the file holding the duplicates, since you didn't seem that interested in it.
/*
@echo off && cls
set WinDirNet=%WinDir%\Microsoft.NET\Framework
IF EXIST "%WinDirNet%\v2.0.50727\csc.exe" set csc="%WinDirNet%\v2.0.50727\csc.exe"
IF EXIST "%WinDirNet%\v3.5\csc.exe" set csc="%WinDirNet%\v3.5\csc.exe"
IF EXIST "%WinDirNet%\v4.0.30319\csc.exe" set csc="%WinDirNet%\v4.0.30319\csc.exe"
%csc% /nologo /out:"%~0.exe" %0
"%~0.exe" %1
del "%~0.exe"
exit /b
*/

//14 mar 2015 @ 13:29
//metaspamer.blogspot.com

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace spicemustflow_sortcleaner {
  /// <summary>
  /// Writes a copy of each input text file with repeated lines removed. The first
  /// occurrence of every line is kept and the original line order is preserved.
  /// </summary>
  class Program {
    /// <summary>
    /// Builds an output file name of the form "{name}_uniques.txt", appending
    /// 2, 3, ... until the name is free.
    /// </summary>
    /// <param name="nameWithoutExtension">
    /// Base name of the output file. It may include a directory prefix; the
    /// existence check is done on exactly the name that is returned.
    /// </param>
    /// <returns>The first candidate name for which no file exists yet.</returns>
    static string GetFilename(string nameWithoutExtension) {
      for (int n = 1; ; n++) {
        // The first candidate has no numeric suffix; later ones get 2, 3, ...
        string outputFile = string.Format("{0}_uniques{1}.txt",
          nameWithoutExtension, n > 1 ? n.ToString() : string.Empty);
        if (!File.Exists(outputFile)) return outputFile;
      }
    }

    /// <summary>
    /// Picks the text encoding for a file from its first three bytes.
    /// </summary>
    /// <param name="filename">Path of the file to inspect.</param>
    /// <returns>
    /// UTF-8 when the file starts with the bytes EF BB BF, the system default
    /// encoding when the file is empty, and code page 1251 otherwise.
    /// </returns>
    static Encoding DetectEncoding(string filename) {
      byte[] head = new byte[3];
      int bytesRead;
      using (FileStream fs = File.OpenRead(filename)) {
        bytesRead = fs.Read(head, 0, head.Length);
      }

      if (bytesRead == 0) return Encoding.Default;
      if (bytesRead == 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
        return Encoding.UTF8;
      return Encoding.GetEncoding(1251);
    }

    /// <summary>
    /// Processes every file named in <paramref name="args"/>, or every *.txt file
    /// next to the executable when no arguments are given. Output files are
    /// written to the executable's directory.
    /// </summary>
    /// <param name="args">Paths of the text files to process.</param>
    static void Main(string[] args) {
      string path = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) + "\\";
      string[] filelist = args.Length == 0
        ? Directory.GetFiles(path, "*.txt", SearchOption.TopDirectoryOnly)
        : args;

      foreach (string filename in filelist) {
        Encoding enc = DetectEncoding(filename);

        // Hand GetFilename the full output path so that its "already exists"
        // check looks in the directory the file is actually written to, not in
        // the current working directory.
        string outputFile = GetFilename(
          Path.Combine(path, Path.GetFileNameWithoutExtension(filename)));

        using (StreamReader inFile = new StreamReader(filename, enc))
        using (StreamWriter outFile = new StreamWriter(outputFile, false, enc)) {
          Console.WriteLine("Deduping {0}..", filename);
          HashSet<string> prevLines = new HashSet<string>();
          string line;
          while ((line = inFile.ReadLine()) != null) {
            // Add returns false when the line has been seen before, so only
            // the first occurrence of each line is written.
            if (prevLines.Add(line))
              outFile.WriteLine(line);
          }
        }
      }
    }
  }
}

How To Ask Questions The Smart Way


Reply ↓  Report •

#5
December 1, 2018 at 17:43:56
nice      

i5-6600K[delid]@4.8GHz/4.4GHz@1.424v LLC=6 | 2x4GB Crucial-DDR4-2133CL15@14-14-14-30 1T 2800MHz@1.37v
MSI Armor RX 570 4GB@1415Mhz core@1.2v/1920MHz


Reply ↓  Report •

Ask Question