Please check this find/rm script I'm about to run as root

Amaranth

I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.

I've tested it myself as a normal user on a few dummy files but am
quite apprehensive to run it as root to clean up my messy file
systems.

Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

A wishlist for the program is to only delete the smaller files if they
are also older, i.e. the largest file that is preserved must be newer
than all other files. If the smaller files are newer then the user is
warned/prompted.
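For reference, the check I have in mind would be roughly the following
(only a sketch, not part of the program below; safe_to_delete is a
hypothetical helper using POSIX stat()):

// Sketch only: decide whether "candidate" may be deleted in favour of
// "keeper" (the largest file of a duplicate group). Deletion is allowed
// only if the candidate is both smaller AND not newer; otherwise the
// caller should warn the user instead.
#include <sys/stat.h>
#include <string>

bool safe_to_delete(const std::string& keeper, const std::string& candidate)
{
    struct stat k, c;
    if (stat(keeper.c_str(), &k) != 0 || stat(candidate.c_str(), &c) != 0)
        return false;                    // cannot stat -> never delete
    return c.st_size < k.st_size         // candidate is smaller
        && c.st_mtime <= k.st_mtime;     // and not newer than the keeper
}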

================================================
// find_duplicates.cpp
#include <iostream>
#include <fstream>
#include <cstdlib>

using std::string;
using std::fstream;
using std::cout;
using std::cin;
using std::ofstream;
using std::ios;
using std::flush;

int main()
{
system("rm full_rm.script; touch full_rm.script");
// Find all files above 100MB
system("find /media/ -type f -size +100M -exec ls -l {} \\; | awk '{print $(NF-3), $(NF-2), $(NF-1), $NF}' > find_duplicates.txt");
system("find /media/ -type f -size +100M -exec ls -l {} \\; | awk '{print $NF}' | sed 's/\\// /g' | awk '{print $NF}' > find_duplicates2.txt");
system("paste find_duplicates.txt find_duplicates2.txt | sort +4 -5 > allfind.txt");
system("rm find_duplicates.txt find_duplicates2.txt");

// Find out number of duplicates
system("awk '{print $NF}' allfind.txt | uniq -d > redundant_files.txt");
system("wc -l redundant_files.txt | awk '{print $1}' > filelines.txt");
fstream get_filelines;
get_filelines.open("filelines.txt");
string inputline;
getline(get_filelines, inputline);
int filelines = atoi(inputline.c_str());
get_filelines.close();
system("rm filelines.txt");

int i, j;
ofstream create_script;
ofstream create_rm_script;
fstream read_redundant_files;
read_redundant_files.open("redundant_files.txt");
system("cat redundant_files.txt");
fstream create_comments;
fstream read_smaller_duplicates;
fstream read_raw_smaller_duplicates;
string duplicates;
string smaller_duplicates;
string raw_smaller_duplicates;
string raw_largest_duplicate;
for (i = 0; i < filelines; i++)
        {
        // List all duplicates
        getline(read_redundant_files, duplicates);
//      cout << i << "\t" << duplicates << "\n";
        create_script.open("list_duplicates.script");
        create_script << "grep " << duplicates << " allfind.txt | sort +0 -1 > list_duplicates" << "\n";
        create_script.close();
//      system("cat list_duplicates.script");
        system("chmod 700 list_duplicates.script");
        system("./list_duplicates.script");

        // Split duplicates into smaller & largest one.
        // Smaller duplicates are deleted
        // Largest duplicate is kept
        system("wc -l list_duplicates | awk '{print $1}' > filelines.txt");
        get_filelines.open("filelines.txt");
        getline(get_filelines, inputline);
        int duplicate_filelines = atoi(inputline.c_str());
        get_filelines.close();
        system("rm filelines.txt");
        create_script.open("split_duplicates.script");
        create_script << "head -" << duplicate_filelines - 1 << " list_duplicates | awk '{print $1, $2, $3, $4}' > raw_smaller_duplicates" << "\n";
        create_script << "head -" << duplicate_filelines - 1 << " list_duplicates | awk '{print $4}' > smaller_duplicates" << "\n";
        create_script << "tail -1 list_duplicates | awk '{print $1, $2, $3, $4}' > raw_largest_duplicate" << "\n";
        create_script << "tail -1 list_duplicates | awk '{print $4}' > largest_duplicate" << "\n";
        create_script.close();
        system("chmod 700 split_duplicates.script");
        system("./split_duplicates.script");

        // Create rm script
        system("cat raw_smaller_duplicates");
        system("cat smaller_duplicates");
        system("cat raw_largest_duplicate");
        system("cat largest_duplicate");

        read_smaller_duplicates.open("smaller_duplicates");
        read_raw_smaller_duplicates.open("raw_smaller_duplicates");
        create_rm_script.open("rm.script");
        for (j = 0; j < duplicate_filelines - 1; j++)
                {
                getline(read_raw_smaller_duplicates, raw_smaller_duplicates);
                create_rm_script << "# " << raw_smaller_duplicates << "\n";
                }
        for (j = 0; j < duplicate_filelines - 1; j++)
                {
                getline(read_smaller_duplicates, smaller_duplicates);
                create_rm_script << "rm -v " << smaller_duplicates << "\n";
                }
        read_smaller_duplicates.close();
        read_raw_smaller_duplicates.close();

        create_comments.open("raw_largest_duplicate");
        getline(create_comments, raw_largest_duplicate);
        create_rm_script << "# " << raw_largest_duplicate << "\n";
        create_rm_script << "\n";
        create_comments.close();
        create_rm_script.close();
        system("cat rm.script full_rm.script > temp_rm.script");
        system("mv temp_rm.script full_rm.script");
        system("rm rm.script");
        }

read_redundant_files.close();
system("rm raw_largest_duplicate largest_duplicate raw_smaller_duplicates smaller_duplicates split_duplicates.script list_duplicates list_duplicates.script redundant_files.txt");
system("chmod 700 full_rm.script");
system("./full_rm.script");

}
 
Amaranth

I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them. <snip>

Hmm, the program doesn't work when the directories have spaces in
their name (due to awk).

Any advice?
 
Alf P. Steinbach

* Amaranth:
I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.

I've tested it myself as a normal user on a few dummy files but am
quite apprehensive to run it as root to clean up my messy file
systems.

Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

There's a lot of 'system' calls.

It seems that this program would be better expressed as a shell script.

Using C++ to do the individual statements of a script is just to complicate things.


Cheers & hth.,

- Alf
 
Xavier Roche

Amaranth wrote:
Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

Sorry, but the code is unreadable. Very complicated, and many system()
calls that should be replaced by clean code.

A simple recursive readdir() handling, with clever stat() to avoid
infinite loops, might be a good start.

-or-, a shell script might do the trick, too.
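Something along those lines could be sketched as below (only a rough
sketch, using POSIX opendir()/readdir()/lstat(); symlinks are skipped
rather than followed, which is enough to avoid loops, and regular
files larger than 100 MB are collected):

#include <dirent.h>
#include <sys/stat.h>
#include <cstring>
#include <string>
#include <vector>

// Recursively collect regular files larger than 100 MB under "dir".
static void scan(const std::string& dir, std::vector<std::string>& out)
{
    DIR* d = opendir(dir.c_str());
    if (!d) return;
    struct dirent* entry;
    while ((entry = readdir(d)) != 0) {
        if (std::strcmp(entry->d_name, ".") == 0 ||
            std::strcmp(entry->d_name, "..") == 0)
            continue;
        std::string path = dir + "/" + entry->d_name;
        struct stat st;
        if (lstat(path.c_str(), &st) != 0)
            continue;                          // unreadable: skip it
        if (S_ISLNK(st.st_mode))
            continue;                          // never follow symlinks
        if (S_ISDIR(st.st_mode))
            scan(path, out);                   // recurse into subdirectory
        else if (S_ISREG(st.st_mode) && st.st_size > 100LL * 1024 * 1024)
            out.push_back(path);               // regular file above 100 MB
    }
    closedir(d);
}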

BTW-

"I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them."

What do you mean by "delete all but the largest of
them"? If the files are "redundant" (identical), their sizes are
identical, aren't they?
 
Unruh

Amaranth said:
I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.

No idea what a "redundant" file is. I do not think you have thought
things through.


I've tested it myself as a normal user on a few dummy files but am
quite apprehensive to run it as root to clean up my messy file
systems.

I would be too. What in the world are you hoping to accomplish? Tell us
the problem not your solution.

Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

Who cares? The idea itself is ill thought out.

A wishlist for the program is to only delete the smaller files if they
are also older, i.e. the largest file that is preserved must be newer
than all other files. If the smaller files are newer then the user is
warned/prompted.

So you want /bin/rm deleted because /usr/bin/awk is bigger?

================================================
// find_duplicates.cpp
<snip>

And you are using cpp to run a shell script why? This is totally silly.


 
Amaranth

There's a lot of 'system' calls.

It seems that this program would be better expressed as a shell script.

Using C++ to do the individual statements of a script is just to complicate things.

Noted, but my capabilities with shell scripts are extremely limited. I
could do what I needed to do with my current knowledge of C++ and
shell scripts, but not exclusively with either.
 
Amaranth

Amaranth wrote:


Sorry, but the code is unreadable. Very complicated, and many system()
calls that should be replaced by clean code.

A simple recursive readdir() handling, with clever stat() to avoid
infinite loops, might be a good start.

-or-, a shell script might do the trick, too.

BTW-

"I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them."

What do you mean by "delete all but the largest of
them"? If the files are "redundant" (identical), their sizes are
identical, aren't they?

Maybe not the best usage of the word redundant but some files (like
tar.gz backups) were made at different times. Sloppy archival but they
might have the same name. Obviously I'd only want to keep the newest
&& largest one.

Other redundant files are just that - they are exactly the same and
I'd like to delete all but one of them.
 
Amaranth

No idea what a "redundant" file is. I do not think you have thought
things through.


I would be too. What in the world are you hoping to accomplish? Tell us
the problem not your solution.

I'm trying to find duplicate files across a few hard drives, and
delete all but the newest && biggest. The duplicate files may differ
in size and modified times (sloppy archival).
Who cares? The idea itself is ill thought out.


So you want /bin/rm deleted because /usr/bin/awk is bigger?

I'd be running it from the /media/ directory in SuSE Linux, under the
root folder. All subdirectories in /media will be the contents of
plugged-in backup hard drives, and not system files, which will not be
touched.
 
Jules

Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

It looks badly like something that either needs to be a shell script or a
C++ program, but not both - or, at the very least, something that needs to
be mostly a shell script which calls a binary (which could be C++) to do
some of the complex processing.

Having a C++ program making a bunch of calls to shell utils just seems
really messy to me.
 
Jules

Hmm, the program doesn't work when the directories have spaces in
their name (due to awk).

Any advice?

A few possibles:

1) Put all the non-filename fields first on a line before the filename,
separated by a character which you know won't appear in the non-filename
fields. Then you should be able to split the line after this 'special'
char (anything after the special char is considered filename), either
using awk or some messing around with expr.

2) write a bash shell function to escape all spaces in the filename using
sed, replacing with some other character (e.g. replace all occurrences of
'_' with '__', then all occurrences of ' ' with '_' - a similar shell
function can do the reverse)

3) Just code the whole lot in C or C++, using a combination of opendir(),
readdir() and stat() calls to walk the filesystem and figure out what
files you're interested in; build the whole lot up into a sorted list,
then use unlink() calls to delete whatever's needed.
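
As a sketch of the second half of option 3 (the candidate paths are
assumed to come from a readdir()-style walk; remove_duplicates and
dry_run are made-up names), grouping by base name and unlinking the
losers. Because the paths stay inside std::string the whole time,
spaces in directory or file names need no special handling:

#include <sys/stat.h>
#include <unistd.h>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Sketch: group candidate paths by base name, keep the largest file of
// each group, and unlink() the rest -- but only if they are also not
// newer than the file being kept; otherwise just print a warning.
void remove_duplicates(const std::vector<std::string>& candidates, bool dry_run)
{
    std::map<std::string, std::vector<std::string> > by_name;
    for (size_t i = 0; i < candidates.size(); ++i) {
        const std::string& p = candidates[i];
        std::string::size_type slash = p.rfind('/');
        by_name[slash == std::string::npos ? p : p.substr(slash + 1)].push_back(p);
    }

    for (std::map<std::string, std::vector<std::string> >::iterator it = by_name.begin();
         it != by_name.end(); ++it) {
        std::vector<std::string>& group = it->second;
        if (group.size() < 2)
            continue;                              // name occurs only once

        // Find the largest file in the group; it will be kept.
        size_t keep = 0;
        struct stat best;
        if (stat(group[0].c_str(), &best) != 0)
            continue;
        for (size_t i = 1; i < group.size(); ++i) {
            struct stat st;
            if (stat(group[i].c_str(), &st) == 0 && st.st_size > best.st_size) {
                keep = i;
                best = st;
            }
        }

        for (size_t i = 0; i < group.size(); ++i) {
            if (i == keep) continue;
            struct stat st;
            if (stat(group[i].c_str(), &st) != 0) continue;
            if (st.st_mtime > best.st_mtime)
                std::cout << "WARNING: smaller but newer, kept: " << group[i] << "\n";
            else if (dry_run)
                std::cout << "would remove " << group[i] << "\n";
            else
                unlink(group[i].c_str());
        }
    }
}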

Notes:

1) newlines are technically legal in filenames I believe (only null and
'/' aren't), which could really screw any line-based processing up. Very
uncommon to see a newline in a filename, but it *could* happen.

2) I still do a bit of C, but last did any C++ well over 10 years ago - I
don't know if there are more 'OO-friendly' versions of opendir etc. that
you'd want to use...

cheers

Jules
 
Amaranth

It looks badly like something that either needs to be a shell script or a
C++ program, but not both - or, at the very least, something that needs to
be mostly a shell script which calls a binary (which could be C++) to do
some of the complex processing.

But then you'd have two mutually dependent programs which you'd have
to deploy together, rather than a single C++ program (which generates
its scripts internally).
Having a C++ program making a bunch of calls to shell utils just seems
really messy to me.

I don't use shell scripts often enough to be able to justify learning
scripting in any detail, or retain what I would learn for the purposes
of this archival clean-up job.

I'm a mechanical engineer, not a programmer. I did what I needed to do
with a Frankenstein of scripts/C++. It may be messy, but I still have
not seen a shell-script alternative to it yet - not that I doubt it
can be done; I asked how to do it with scripts a while back but got no
takers. So I did it the way I knew how.
 
Amaranth

And if there are two files, one newer and the other larger?

Then it prepares the rm script anyway but flags a warning. Perhaps put
a special character in the vicinity for me to quickly find the problem
in the created rm script. The user can then go to those instances and
make a decision based on his/her discretion.
So for each file name you want to delete all but the newest?  Or all but
the largest?

The program should flag no problems if the smaller files are also
older. If the largest file is older than any of the smaller ones, then
it should give a warning and let the user decide.

I re-mounted my /media drives as read-only and made sure the program
created the rm script, but stopped short of executing it. Running the
program as root found about 300 instances of duplicate files with size
larger than 100MB.
 
Amaranth

I don't think you ever explained clearly what the problem is.

1) Recursively search all directories for files with a matching name.
2) If files with a matching name are found, then:
- Delete files only if they are both smaller && older
- Otherwise give the user a warning
You tried it and it worked, then?

It didn't work when directories/filenames had spaces in them. I'm
trying to decide whether to just manually fix those instances or write
a fix. Also I'm trying to decide whether archival redundancy over time
(e.g. backups taken over time the work was conducted) is good policy
in itself or if I could use the space better by having mirror copies
of the most up-to-date archives.

Also I'm very interested to see how a script can accomplish the same
thing. It'd be an educational exercise in learning by example, to say
the least.
 
despen

Amaranth said:
I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.

I've tested it myself as a normal user on a few dummy files but am
quite apprehensive to run it as root to clean up my messy file
systems.

Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

A wishlist for the program is to only delete the smaller files if they
are also older, i.e. the largest file that is preserved must be newer
than all other files. If the smaller files are newer then the user is
warned/prompted.

================================================
// find_duplicates.cpp
#include <iostream>


Aggh! Totally wrong language.

First you should better explain what you are up to.

Then you need to pick a better language.

For simple things, a shell variant like bash is OK.
For something like this Perl is the way to go.
All that stuff - removing files, getting file sizes, spaces in file
names - is easily handled with native Perl functions.

Here's the perl "find" module:

http://perldoc.perl.org/File/Find.html
 
Unruh

Maybe not the best usage of the word redundant but some files (like
tar.gz backups) were made at different times. Sloppy archival but they
might have the same name. Obviously I'd only want to keep the newest
&& largest one.
Other redundant files are just that - they are exactly the same and
I'd like to delete all but one of them.

No. Why should files in different directories but having the same name be
"the same"? Thus it could well be that many directories have files called
config. They are the config files for different programs.
And if they are in the same directory they cannot have the same name.
You are just confused and threatening to destroy your system by your
confusion.
 
Unruh

1) newlines are technically legal in filenames I believe (only null and
'/' aren't), which could really screw any line-based processing up. Very
uncommon to see a newline in a filename, but it *could* happen.

Yes, one of my son's programs has file names with newlines in them. It
totally messes up my backup program. Totally idiotic. But there it is.
 
Amaranth

No. Why should files in different directories but having the same name be
"the same"? Thus it could well be that many directories have files called
config. They are the config files for different programs.

I of course plan to check the remove script before running it, in case
of this.
And if they are in the same directory they cannot have the same name.
You are just confused and threatening to destroy your system by your
confusion.

All the files in the /media subdirectory in SuSE Linux belong to
attached drives and are unrelated to system files.
 
Nathan Keel

Amaranth said:
I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.

I've tested it myself as a normal user on a few dummy files but am
quite apprehensive to run it as root to clean up my messy file
systems.

Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?

A wishlist for the program is to only delete the smaller files if they
are also older, i.e. the largest file that is preserved must be newer
than all other files. If the smaller files are newer then the user is
warned/prompted.

What makes you think they are redundant, just because they're the same
size and name? Not that it's unreasonable to think they are, but why
not do a file sum check to determine it? Also, why such a large C++
program for this task? A couple of find commands, which you execute
from that program anyway, and maybe a few lines of shell code would
accomplish the same thing.
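
Such a sum check could be sketched like this (file_hash is a made-up
helper computing a 64-bit FNV-1a hash of the file contents; for real
use a stronger digest such as md5sum/sha256sum output would be
preferable):

#include <fstream>
#include <string>
#include <stdint.h>

// Sketch: hash the whole file with 64-bit FNV-1a. Files with equal
// hashes (and equal sizes) are almost certainly identical; equal names
// alone prove nothing.
uint64_t file_hash(const std::string& path)
{
    std::ifstream in(path.c_str(), std::ios::binary);
    uint64_t h = 14695981039346656037ULL;          // FNV-1a offset basis
    char buf[65536];
    while (in.good()) {
        in.read(buf, sizeof buf);
        std::streamsize n = in.gcount();           // bytes actually read
        for (std::streamsize i = 0; i < n; ++i) {
            h ^= static_cast<unsigned char>(buf[i]);
            h *= 1099511628211ULL;                 // FNV-1a prime
        }
    }
    return h;
}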
 
Jakub Fišer

Please don't abuse C; what you made here is a shell script, not a C
programme.

Don't take this personally though, I don't mean to insult you in any way...

Use a shell script for this task - it's quite simple. You've done most
of the work in the code you provided; just do the rest in bash.

You can then deploy a single shell script.

I think something like this one simple line should work for you:

rm `find /mnt/bigfiles -size +100M -exec stat -c "%s '%n'" "{}" \; | sort -n |\
head -n -1 | cut -d" " -f2-`

-miky




I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them. <snip>


--
Jakub Fišer AKA (e-mail address removed)
ICQ: I don't see kyou - http://icq.xmpp.cz/
JID: (e-mail address removed)

Please avoid Word or PowerPoint attachments:
http://www.gnu.org/philosophy/no-word-attachments.cs.html

Please avoid sending me Word, PowerPoint, etc. attachments:
http://www.gnu.org/philosophy/no-word-attachments.html
 
DenverD

Amaranth said:
I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.<snip>

imo buying more disk space is cheaper, easier and quicker than
accidentally destroying your system OR accidentally deleting either
your doctoral thesis before delivery to the committee, or your p0rn
collection..

so, how many hours have you spent on this script so far?
and, how much is your time worth?

i'd guess it might take one or two hours TOTAL to locate (on line) a
plug-in-MORE-SPACE-solution, unpack and hook it up...and breathe easy..

THINK!
 
