Remove repeated words from a file

A

arnuld

/* A C program that reads a file and copies the contents to a new file
while discarding all the repeated words.
* Written by one of my friends, posted by me on CLC for constructive
criticism. I don't think it's a standard
* C program, hence I posted it here to make it one :)
*
* VERSION 0.0
*
*/


#define __GNU__SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Intended to copy the words of "myfile" to "outputfile", writing each
 * word only the first time it is seen; it also echoes the file to stdout.
 *
 * NOTE(review): as posted this does not compile.  See the notes at the
 * getline() call and at the `array = ...` assignment below.
 */
int main ()
{
/* Word buffer for fscanf(); the "%s" used below carries no field width,
   so a word of 50 or more characters would overflow it. */
char str1[50] = {0};
/* Meant to hold pointers to the words seen so far (at most 100 slots). */
char *array[100];
FILE *ifp,*ofp;
/* Never used; the compiler log in this thread warns about it. */
FILE *baseifp;


int i = 0,k=0,flag;
/* A failed fopen() is reported with perror() but execution continues,
   so the code below would go on to use a NULL stream. */
ifp = fopen("myfile", "r");
if(ifp==NULL)
perror("input File is not open");
ofp = fopen("outputfile", "w");
if(ofp==NULL)
perror("output File is not open");


char * line = NULL;
size_t len = 0;
/* ssize_t and getline() are POSIX, not ISO C; the posted build with
   -std=c99 -pedantic reports both as undeclared. */
ssize_t read;
/* print read elements on stdout */
while ((read = getline(&line, &len, ifp)) != -1) {
/* NOTE(review): %zu expects a size_t, but `read` is a signed ssize_t. */
printf("Retrieved line of length %zu :\n", read);
printf("%s", line);
}
/* free(NULL) is a no-op, so this guard is not needed. */
if (line)
free(line);
//fclose (ifp);
/* Reopen the input so the word-by-word pass starts at the beginning. */
ifp = freopen("myfile", "r", ifp);

/* Second pass: one whitespace-delimited word per iteration. */
while(fscanf(ifp, "%s", str1)!=EOF)
{
printf ("%s\n",str1);
/* flag becomes 1 when the current word matches a stored one. */
flag = 0;
/* NOTE(review): `array` is an array object and cannot be assigned to;
   presumably array[i] was intended here and in the strcpy() below --
   confirm.  The malloc() result is not checked either. */
array = (char *)malloc (strlen (str1)+1);
strcpy(array,str1);

/* Linear search through entries 0 .. i-1 for an identical word. */
if(i > 0)
for (k = 0; k < i ; k++)
{
if (strcmp(array[k], str1)==0)
{
flag = 1;
break;
}
}
/* Only words with no earlier match are written to the output file. */
if (flag == 0)
{
fprintf(ofp, "%s ", str1);
}
/* Advances for every word read; nothing checks it against the 100
   slots of `array`. */
i++;
/* Clear the buffer before the next fscanf(). */
memset (str1, 0, 50);
}
printf ("\n");
/* Nothing allocated inside the loop is freed before returning. */
fclose(ifp);
fclose(ofp);

return 0;
}

================== OUTPUT ========================

[arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
repeated-words.c
remove-repeated-words.c: In function ‘main’:
remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
this function)
remove-repeated-words.c:33: error: (Each undeclared identifier is
reported only once
remove-repeated-words.c:33: error: for each function it appears in.)
remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
function)
remove-repeated-words.c:35: warning: implicit declaration of function
‘getline’
remove-repeated-words.c:19: warning: unused variable ‘baseifp’
[arnuld@dune programs]$




Everything is explained in the comments, I have these ideas:

1) First, #define __GNU_SOURCE has to go; it's not a standard C facility.
2) getline() is not a C function, so I think using fgets() will be a
better idea ?

Will post the code as soon as I rewrite it. Till then can I have your
views ?
 
B

Ben Bacarisse

arnuld said:
Everything is explained in the comments, I have these ideas:

1) First #define __GNU_SOURCE has to go, its not a standard C facility.
2) getline() is not a C function, so I think using fgets() will be a
better idea ?

The program doesn't "use" getline. The call is pointless and looks
like a left-over from some previous version.
Will post the code as soon as I rewrite it. Till then can I have your
views ?

I think "re-write" is the wrong word. Just start again since almost
nothing is worth preserving. If the author had posted here, I would
be *much* more encouraging since it looks like an honest attempt by a
beginner, but there is not much point in your trying to "improve"
it.
 
U

user923005

/* A C program that reads a file and copies the contents to a new file
while discarding all the repeated words.
 * Written by one of my friends, posted by me on CLC for constructive
criticism. I dont' think its a standard
 * C program, hence I posted it here to make it one :)
 *
 * VERSION 0.0
 *
 */

#define __GNU__SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Intended to copy the words of "myfile" to "outputfile", writing each
 * word only the first time it is seen; it also echoes the file to stdout.
 *
 * NOTE(review): as quoted this does not compile.  See the notes at the
 * getline() call and at the `array = ...` assignment below.
 */
int main ()
{
        /* Word buffer for fscanf(); the "%s" used below carries no field
           width, so a word of 50 or more characters would overflow it. */
        char str1[50] = {0};
        /* Meant to hold pointers to the words seen so far (100 slots). */
        char *array[100];
        FILE *ifp,*ofp;
        /* Never used; the compiler log in this thread warns about it. */
        FILE *baseifp;

        int i = 0,k=0,flag;
        /* A failed fopen() is reported with perror() but execution
           continues, so the code below would use a NULL stream. */
        ifp = fopen("myfile", "r");
        if(ifp==NULL)
                perror("input File is not open");
        ofp = fopen("outputfile", "w");
        if(ofp==NULL)
                perror("output File is not open");

        char * line = NULL;
        size_t len = 0;
        /* ssize_t and getline() are POSIX, not ISO C; the posted build
           with -std=c99 -pedantic reports both as undeclared. */
        ssize_t read;
        /* print read elements on stdout */
        while ((read = getline(&line, &len, ifp)) != -1) {
                /* NOTE(review): %zu expects a size_t, but `read` is a
                   signed ssize_t. */
                printf("Retrieved line of length %zu :\n", read);
                printf("%s", line);
        }
        /* free(NULL) is a no-op, so this guard is not needed. */
        if (line)
                free(line);
        //fclose (ifp);
        /* Reopen the input so the word pass starts at the beginning. */
        ifp = freopen("myfile", "r", ifp);

        /* Second pass: one whitespace-delimited word per iteration. */
        while(fscanf(ifp, "%s", str1)!=EOF)
        {
                printf ("%s\n",str1);
                /* flag becomes 1 when the word matches a stored one. */
                flag = 0;
                /* NOTE(review): `array` is an array object and cannot be
                   assigned to; presumably array[i] was intended here and
                   in the strcpy() below -- confirm.  The malloc() result
                   is not checked either. */
                array = (char *)malloc (strlen (str1)+1);
                strcpy(array,str1);

                /* Linear search through entries 0 .. i-1. */
                if(i > 0)
                        for (k = 0; k < i ; k++)
                        {
                                if (strcmp(array[k], str1)==0)
                                {
                                        flag = 1;
                                        break;
                                }
                        }
                /* Only words with no earlier match are written out. */
                if (flag == 0)
                {
                        fprintf(ofp, "%s ", str1);
                }
                /* Advances for every word read; nothing checks it against
                   the 100 slots of `array`. */
                i++;
                /* Clear the buffer before the next fscanf(). */
                memset (str1, 0, 50);
        }
        printf ("\n");
        /* Nothing allocated inside the loop is freed before returning. */
        fclose(ifp);
        fclose(ofp);

        return 0;

}

================== OUTPUT ========================

[arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
repeated-words.c
remove-repeated-words.c: In function ‘main’:
remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
this function)
remove-repeated-words.c:33: error: (Each undeclared identifier is
reported only once
remove-repeated-words.c:33: error: for each function it appears in.)
remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
function)
remove-repeated-words.c:35: warning: implicit declaration of function
‘getline’
remove-repeated-words.c:19: warning: unused variable ‘baseifp’
[arnuld@dune programs]$

Everything is explained in the comments, I have these ideas:

1) First #define __GNU_SOURCE has to go, its not a standard C facility.
2) getline() is not a C function, so I think using fgets() will be a
better idea ?

Will post the code as soon as I rewrite it. Till then can I have your
views ?

--www.lispmachine.wordpress.com
my email is @ the above blog.


I would rewrite it from scratch, using fgets() and strtok().

The definition is unclear about repeated words.
Does the program need to understand punctuation and capitalization?
Is the goal to actually create a dictionary of unique words?

If it is to be something akin to a spell checker, but having the
function of duplicate word detection, then it is really a very
difficult problem.
And it probably shouldn't always do what is requested. For instance
(from a Monty Python Script):
John: "Oh Marsha, I could make a fool of myself!"
Marsha: "Oh yes, John... Do! Do!"
<John puts on gag glasses with funny nose and moustache attached>

So, my two cents:
1. Making a dictionary of unique words from a file is easy.
2. Removing duplicate words from a file ignoring case and punctuation
is much harder.
3. Actual correction of English text so that the intent is preserved
is an incredibly difficult problem.

In any case, the above attempt accomplishes none of the above and
should be re-written from scratch.
IMO-YMMV.
 
U

user923005

/* A C program that reads a file and copies the contents to a new file
while discarding all the repeated words.
 * Written by one of my friends, posted by me on CLC for constructive
criticism. I dont' think its a standard
 * C program, hence I posted it here to make it one :)
 *
 * VERSION 0.0
 *
 */
#define __GNU__SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Intended to copy the words of "myfile" to "outputfile", writing each
 * word only the first time it is seen; it also echoes the file to stdout.
 *
 * NOTE(review): as quoted this does not compile.  See the notes at the
 * getline() call and at the `array = ...` assignment below.  This quoted
 * copy also stops at `return 0;` without main's closing brace.
 */
int main ()
{
        /* Word buffer for fscanf(); the "%s" used below carries no field
           width, so a word of 50 or more characters would overflow it. */
        char str1[50] = {0};
        /* Meant to hold pointers to the words seen so far (100 slots). */
        char *array[100];
        FILE *ifp,*ofp;
        /* Never used; the compiler log in this thread warns about it. */
        FILE *baseifp;
        int i = 0,k=0,flag;
        /* A failed fopen() is reported with perror() but execution
           continues, so the code below would use a NULL stream. */
        ifp = fopen("myfile", "r");
        if(ifp==NULL)
                perror("input File is not open");
        ofp = fopen("outputfile", "w");
        if(ofp==NULL)
                perror("output File is not open");
        char * line = NULL;
        size_t len = 0;
        /* ssize_t and getline() are POSIX, not ISO C; the posted build
           with -std=c99 -pedantic reports both as undeclared. */
        ssize_t read;
        /* print read elements on stdout */
        while ((read = getline(&line, &len, ifp)) != -1) {
                /* NOTE(review): %zu expects a size_t, but `read` is a
                   signed ssize_t. */
                printf("Retrieved line of length %zu :\n", read);
                printf("%s", line);
        }
        /* free(NULL) is a no-op, so this guard is not needed. */
        if (line)
                free(line);
        //fclose (ifp);
        /* Reopen the input so the word pass starts at the beginning. */
        ifp = freopen("myfile", "r", ifp);
        /* Second pass: one whitespace-delimited word per iteration. */
        while(fscanf(ifp, "%s", str1)!=EOF)
        {
                printf ("%s\n",str1);
                /* flag becomes 1 when the word matches a stored one. */
                flag = 0;
                /* NOTE(review): `array` is an array object and cannot be
                   assigned to; presumably array[i] was intended here and
                   in the strcpy() below -- confirm.  The malloc() result
                   is not checked either. */
                array = (char *)malloc (strlen (str1)+1);
                strcpy(array,str1);

                /* Linear search through entries 0 .. i-1. */
                if(i > 0)
                        for (k = 0; k < i ; k++)
                        {
                                if (strcmp(array[k], str1)==0)
                                {
                                        flag = 1;
                                        break;
                                }
                        }
                /* Only words with no earlier match are written out. */
                if (flag == 0)
                {
                        fprintf(ofp, "%s ", str1);
                }
                /* Advances for every word read; nothing checks it against
                   the 100 slots of `array`. */
                i++;
                /* Clear the buffer before the next fscanf(). */
                memset (str1, 0, 50);
        }
        printf ("\n");
        /* Nothing allocated inside the loop is freed before returning. */
        fclose(ifp);
        fclose(ofp);
        return 0;

================== OUTPUT ========================
[arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
repeated-words.c
remove-repeated-words.c: In function ‘main’:
remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
this function)
remove-repeated-words.c:33: error: (Each undeclared identifier is
reported only once
remove-repeated-words.c:33: error: for each function it appears in.)
remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
function)
remove-repeated-words.c:35: warning: implicit declaration of function
‘getline’
remove-repeated-words.c:19: warning: unused variable ‘baseifp’
[arnuld@dune programs]$
Everything is explained in the comments, I have these ideas:
1) First #define __GNU_SOURCE has to go, its not a standard C facility.
2) getline() is not a C function, so I think using fgets() will be a
better idea ?
Will post the code as soon as I rewrite it. Till then can I have your
views ?
--www.lispmachine.wordpress.com
my email is @ the above blog.

I would rewrite it from scratch, using fgets() and strtok().

The definition is unclear about repeated words.
Does the program need to understand punctuation and capitalization?
Is the goal to actually create a dictionary of unique words?

If it is to be something akin to a spell checker, but having the
function of duplicate word detection, then it is really a very
difficult problem.
And it probably shouldn't always do what is requested.  For instance
(from a Monty Python Script):
John: "Oh Marsha, I could make a fool of myself!"
Marsha: "Oh yes, John... Do! Do!"
<John puts on gag glasses with funny nose and moustache attached>

So, my two cents:
1.  Making a dictionary of unique words from a file is easy.
2.  Removing duplicate words from a file ignoring case and punctuation
is much harder.
3.  Actual correction of English text so that the intent is preserved
is an incredibly difficult problem.

In any case, the above attempt accomplishes none of the above and
should be re-written from scratch.
IMO-YMMV.


Maybe something like this:

/*
Purpose:
Primitive program to detect and remove repeated words.
It does not understand hyphenated continuations.
It does not understand capitalization.
It does not understand punctuation.
It does not understand repetition for emphasis.
It's dumb as a box of hammers.

Limits:
It won't work with lines or words bigger than 64K.

Side effects:
It strips out punctuation.
It turns all white space into plain space chars.
It turns all words into lower case words.

Notes:
Use at your own peril.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Line buffer: filled by fgets() in main(), rewritten by clean_string()
   and then split in place by strtok(). */
static char string[65535];
/* Private copy of the most recently handled token.  Tokens point into
   `string`, which the next fgets() overwrites, so main() copies the token
   here to be able to compare it with the first word of the next line. */
static char save_token_string[65535];

/*
 * Normalise a NUL-terminated string in place so it can be split on ' '.
 *
 * Every punctuation or white-space character is replaced by a single
 * space and every upper-case letter by its lower-case form, as classified
 * by <ctype.h> in the current locale.  All other characters are left
 * untouched and the length of the string does not change.
 *
 * s: string to rewrite; a NULL pointer is accepted and ignored.
 */
void clean_string(char *s)
{
    if (s == NULL)
        return;

    for (; *s != '\0'; s++) {
        /* The <ctype.h> functions are only defined for EOF and values
           representable as unsigned char; a plain char may be negative,
           so convert before classifying. */
        unsigned char c = (unsigned char) *s;

        if (ispunct(c) || isspace(c))
            *s = ' ';
        else if (isupper(c))
            *s = (char) tolower(c);
    }
}

/*
 * Filter stdin to stdout, dropping every word that is identical to the
 * word immediately before it.
 *
 * Each input line is normalised with clean_string() and split on spaces.
 * A word is printed, followed by one space, unless it equals the previous
 * word; the previous word is remembered across line boundaries, so a word
 * repeated at the end of one line and the start of the next is dropped too.
 *
 * Returns 0 on success, or EXIT_FAILURE if reading stdin or writing
 * stdout failed.
 */
int main(void)
{
    const char *previous_token = "";
    char *token;

    while (fgets(string, sizeof string, stdin) != NULL) {
        clean_string(string);

        for (token = strtok(string, " ");
             token != NULL;
             token = strtok(NULL, " ")) {
            /* Equal to the last word handled: skip it.  The saved copy
               already holds this text, so there is nothing to update. */
            if (strcmp(token, previous_token) == 0)
                continue;

            printf("%s ", token);

            /* token points into `string`, which the next fgets() will
               overwrite, so keep a private copy for later comparisons.
               A token is always shorter than `string`, and the two
               buffers have the same size, so the copy cannot overflow. */
            strcpy(save_token_string, token);
            previous_token = save_token_string;
        }
    }

    /* fgets() returns NULL for both end-of-file and a read error; tell
       them apart, and make sure buffered output really got written. */
    if (ferror(stdin) || fflush(stdout) == EOF || ferror(stdout))
        return EXIT_FAILURE;

    return 0;
}
/*
Input file:
C:\tmp>type pitts.dat
Paris in the
the Spring.

Output:
paris in the spring
*/
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,755
Messages
2,569,536
Members
45,012
Latest member
RoxanneDzm

Latest Threads

Top