Amaranth
I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.
I've tested it myself as a normal user on a few dummy files, but I'm
quite apprehensive about running it as root to clean up my messy file
systems.
Any comments on the program, advice on streamlining it, bugs spotted,
etc.?
A wishlist item for the program is to only delete the smaller files if
they are also older, i.e. the largest file, which is preserved, must be
newer than all the other copies. If any of the smaller files is newer,
the user is warned/prompted.
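
To make that wishlist item concrete, here is a rough sketch of what the
age check could look like. It is only an illustration, not part of the
program below: it assumes C++17's <filesystem> is available, and the
command-line interface is made up just for the example.

// age_check_sketch.cpp -- illustration only, not wired into the program below.
// Usage: ./age_check_sketch <file-to-keep> <smaller-duplicate>...
#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

// True if the copy we keep is at least as new as the copy we would remove.
static bool safe_to_delete(const fs::path& kept, const fs::path& smaller)
{
    return fs::last_write_time(smaller) <= fs::last_write_time(kept);
}

int main(int argc, char* argv[])
{
    if (argc < 3) {
        std::cerr << "usage: " << argv[0] << " <file-to-keep> <smaller-duplicate>...\n";
        return 1;
    }

    const fs::path kept = argv[1];
    for (int i = 2; i < argc; ++i) {
        const fs::path smaller = argv[i];
        if (safe_to_delete(kept, smaller)) {
            std::cout << "ok to delete: " << smaller << "\n";  // older (or same age) copy
        } else {
            // The smaller file is newer than the one being kept: ask first.
            std::cout << smaller << " is newer than " << kept << ". Delete anyway? [y/N] ";
            std::string answer;
            std::getline(std::cin, answer);
            std::cout << (answer == "y" || answer == "Y" ? "would delete: " : "keeping: ")
                      << smaller << "\n";
        }
    }
}
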
================================================
// find_duplicates.cpp
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <string>
using std::string;
using std::fstream;
using std::cout;
using std::cin;
using std::ofstream;
using std::ios;
using std::flush;
int main()
{
system("rm full_rm.script; touch full_rm.script");
// Find all files above 100MB
system("find /media/ -type f -size +100M -exec ls -l {} \\; | awk
'{print $(NF-3), $(NF-2), $(NF-1), $NF}' > find_duplicates.txt" );
system("find /media/ -type f -size +100M -exec ls -l {} \\; | awk
'{print $NF}' | sed 's/\\// /g' | awk '{print $NF}' >
find_duplicates2.txt" );
system("paste find_duplicates.txt find_duplicates2.txt | sort +4 -5 >
allfind.txt");
system("rm find_duplicates.txt find_duplicates2.txt");
// Find out number of duplicates
system("awk '{print $NF}' allfind.txt | uniq -d >
redundant_files.txt");
system("wc -l redundant_files.txt | awk '{print $1}' >
filelines.txt");
fstream get_filelines;
get_filelines.open("filelines.txt");
string inputline;
getline(get_filelines, inputline);
int filelines = atoi(inputline.c_str());
get_filelines.close();
system("rm filelines.txt");
int i, j;
ofstream create_script;
ofstream create_rm_script;
fstream read_redundant_files;
read_redundant_files.open("redundant_files.txt");
system("cat redundant_files.txt");
fstream create_comments;
fstream read_smaller_duplicates;
fstream read_raw_smaller_duplicates;
string duplicates;
string smaller_duplicates;
string raw_smaller_duplicates;
string raw_largest_duplicate;
for (i = 0; i < filelines; i++)
{
// List all duplicates
getline(read_redundant_files, duplicates);
// cout << i << "\t" << duplicates << "\n";
create_script.open("list_duplicates.script");
create_script << "grep " << duplicates << " allfind.txt | sort +0 -1
// system("cat list_duplicates.script");
system("chmod 700 list_duplicates.script");
system("./list_duplicates.script");
// Split duplicates into smaller & largest one.
// Smaller duplicates are deleted
// Largest duplicate is kept
system("wc -l list_duplicates | awk '{print $1}' > filelines.txt");
get_filelines.open("filelines.txt");
getline(get_filelines, inputline);
int duplicate_filelines = atoi(inputline.c_str());
get_filelines.close();
system("rm filelines.txt");
create_script.open("split_duplicates.script");
create_script << "head -" << duplicate_filelines - 1 << "
list_duplicates | awk '{print $1, $2, $3, $4}' >
raw_smaller_duplicates" << "\n";
create_script << "head -" << duplicate_filelines - 1 << "
list_duplicates | awk '{print $4}' > smaller_duplicates" << "\n";
create_script << "tail -1 list_duplicates | awk '{print $1, $2, $3,
$4}' > raw_largest_duplicate" << "\n";
create_script << "tail -1 list_duplicates | awk '{print $4}' >
largest_duplicate" << "\n";
create_script.close();
system("chmod 700 split_duplicates.script");
system("./split_duplicates.script");
// Create rm script
system("cat raw_smaller_duplicates");
system("cat smaller_duplicates");
system("cat raw_largest_duplicate");
system("cat largest_duplicate");
read_smaller_duplicates.open("smaller_duplicates");
read_raw_smaller_duplicates.open("raw_smaller_duplicates");
create_rm_script.open("rm.script");
for (j = 0; j < duplicate_filelines - 1; j++)
{
getline(read_raw_smaller_duplicates,raw_smaller_duplicates);
create_rm_script << "# " << raw_smaller_duplicates << "\n";
}
for (j = 0; j < duplicate_filelines - 1; j++)
{
getline(read_smaller_duplicates,smaller_duplicates);
create_rm_script << "rm -v " << smaller_duplicates << "\n";
}
read_smaller_duplicates.close();
read_raw_smaller_duplicates.close();
create_comments.open("raw_largest_duplicate");
getline(create_comments,raw_largest_duplicate);
create_rm_script << "# " << raw_largest_duplicate << "\n";
create_rm_script << "\n";
create_comments.close();
create_rm_script.close();
system("cat rm.script full_rm.script > temp_rm.script");
system("mv temp_rm.script full_rm.script");
system("rm rm.script");
}
read_redundant_files.close();
system("rm raw_largest_duplicate largest_duplicate
raw_smaller_duplicates smaller_duplicates split_duplicates.script
list_duplicates list_duplicates.script redundant_files.txt");
system("chmod 700 full_rm.script");
system("./full_rm.script");
}
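
For comparison, here is a sketch of what the file-collection step could
look like in plain C++17 <filesystem>, without the find/awk pipelines
and the temporary files. This is only an illustration and is not wired
into the program above; it keeps the same assumptions as the program
(the /media root, the 100MB threshold, and files sharing a name being
treated as copies of one another).

// collect_sketch.cpp -- sketch of replacing the find/awk/temp-file steps.
#include <cstdint>
#include <filesystem>
#include <iostream>
#include <map>
#include <string>
#include <system_error>
#include <utility>
#include <vector>

namespace fs = std::filesystem;

int main()
{
    const std::uintmax_t min_size = 100u * 1024 * 1024;  // 100MB, as in 'find -size +100M'

    // Group (size, full path) by file name -- the same key the awk/sed pipeline produces.
    std::map<std::string, std::vector<std::pair<std::uintmax_t, fs::path>>> by_name;

    std::error_code walk_ec;
    for (fs::recursive_directory_iterator
             it("/media", fs::directory_options::skip_permission_denied, walk_ec), end;
         !walk_ec && it != end; it.increment(walk_ec))
    {
        std::error_code ec;
        if (!it->is_regular_file(ec) || ec)
            continue;                                     // skip non-files and unreadable entries
        const std::uintmax_t size = it->file_size(ec);
        if (ec || size <= min_size)
            continue;
        by_name[it->path().filename().string()].emplace_back(size, it->path());
    }

    // Report only names that occur more than once, with their sizes and paths.
    for (const auto& [name, copies] : by_name) {
        if (copies.size() < 2)
            continue;
        std::cout << name << " has " << copies.size() << " copies over 100MB\n";
        for (const auto& [size, path] : copies)
            std::cout << "  " << size << " bytes  " << path << "\n";
    }
}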