Remove repeated words from a file

Discussion in 'C Programming' started by arnuld, Sep 18, 2009.

  1. arnuld

    arnuld Guest

    /* A C program that reads a file and copies the contents to a new file
    while discarding all the repeated words.
    * Written by one of my friends, posted by me on CLC for constructive
    criticism. I dont' think its a standard
    * C program, hence I posted it here to make it one :)
    *
    * VERSION 0.0
    *
    */


    #define __GNU__SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    int main ()
    {
    char str1[50] = {0};
    char *array[100];
    FILE *ifp,*ofp;
    FILE *baseifp;


    int i = 0,k=0,flag;
    ifp = fopen("myfile", "r");
    if(ifp==NULL)
    perror("input File is not open");
    ofp = fopen("outputfile", "w");
    if(ofp==NULL)
    perror("output File is not open");


    char * line = NULL;
    size_t len = 0;
    ssize_t read;
    /* print read elements on stdout */
    while ((read = getline(&line, &len, ifp)) != -1) {
    printf("Retrieved line of length %zu :\n", read);
    printf("%s", line);
    }
    if (line)
    free(line);
    //fclose (ifp);
    ifp = freopen("myfile", "r", ifp);

    while(fscanf(ifp, "%s", str1)!=EOF)
    {
    printf ("%s\n",str1);
    flag = 0;
    array = (char *)malloc (strlen (str1)+1);
    strcpy(array,str1);

    if(i > 0)
    for (k = 0; k < i ; k++)
    {
    if (strcmp(array[k], str1)==0)
    {
    flag = 1;
    break;
    }
    }
    if (flag == 0)
    {
    fprintf(ofp, "%s ", str1);
    }
    i++;
    memset (str1, 0, 50);
    }
    printf ("\n");
    fclose(ifp);
    fclose(ofp);

    return 0;
    }

    ================== OUTPUT ========================

    [arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
    repeated-words.c
    remove-repeated-words.c: In function ‘main’:
    remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
    this function)
    remove-repeated-words.c:33: error: (Each undeclared identifier is
    reported only once
    remove-repeated-words.c:33: error: for each function it appears in.)
    remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
    remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
    function)
    remove-repeated-words.c:35: warning: implicit declaration of function
    ‘getline’
    remove-repeated-words.c:19: warning: unused variable ‘baseifp’
    [arnuld@dune programs]$




    Everything is explained in the comments, I have these ideas:

    1) First #define __GNU_SOURCE has to go, its not a standard C facility.
    2) getline() is not a C function, so I think using fgets() will be a
    better idea ?

    Will post the code as soon as I rewrite it. Till then can I have your
    views ?



    --
    www.lispmachine.wordpress.com
    my email is @ the above blog.
    arnuld, Sep 18, 2009
    #1
    1. Advertising

  2. arnuld <> writes:
    <snip>
    >
    > Everything is explained in the comments, I have these ideas:
    >
    > 1) First #define __GNU_SOURCE has to go, its not a standard C facility.
    > 2) getline() is not a C function, so I think using fgets() will be a
    > better idea ?


    The program doesn't "use" getline. The call is pointless and looks
    like a left-over from some previous version.

    > Will post the code as soon as I rewrite it. Till then can I have your
    > views ?


    I think "re-write" is the wrong word. Just start again since almost
    nothing is worth preserving. If the author had posted here, I'd would
    be *much* more encouraging since it looks like an honest attempt by a
    beginner, but there is not mot much point in your trying to "improve"
    it.

    --
    Ben.
    Ben Bacarisse, Sep 18, 2009
    #2
    1. Advertising

  3. arnuld

    user923005 Guest

    On Sep 18, 12:46 am, arnuld <> wrote:
    > /* A C program that reads a file and copies the contents to a new file
    > while discarding all the repeated words.
    >  * Written by one of my friends, posted by me on CLC for constructive
    > criticism. I dont' think its a standard
    >  * C program, hence I posted it here to make it one :)
    >  *
    >  * VERSION 0.0
    >  *
    >  */
    >
    > #define __GNU__SOURCE
    > #include <stdio.h>
    > #include <stdlib.h>
    > #include <string.h>
    > int main ()
    > {
    >         char str1[50] = {0};
    >         char *array[100];
    >         FILE *ifp,*ofp;
    >         FILE *baseifp;
    >
    >         int i = 0,k=0,flag;
    >         ifp = fopen("myfile", "r");
    >         if(ifp==NULL)
    >                 perror("input File is not open");
    >         ofp = fopen("outputfile", "w");
    >         if(ofp==NULL)
    >                 perror("output File is not open");
    >
    >         char * line = NULL;
    >         size_t len = 0;
    >         ssize_t read;
    >         /* print read elements on stdout */
    >         while ((read = getline(&line, &len, ifp)) != -1) {
    >                 printf("Retrieved line of length %zu :\n", read);
    >                 printf("%s", line);
    >         }
    >         if (line)
    >                 free(line);
    >         //fclose (ifp);
    >         ifp = freopen("myfile", "r", ifp);
    >
    >         while(fscanf(ifp, "%s", str1)!=EOF)
    >         {
    >                 printf ("%s\n",str1);
    >                 flag = 0;
    >                 array = (char *)malloc (strlen (str1)+1);
    >                 strcpy(array,str1);
    >
    >                 if(i > 0)
    >                         for (k = 0; k < i ; k++)
    >                         {
    >                                 if (strcmp(array[k], str1)==0)
    >                                 {
    >                                         flag = 1;
    >                                         break;
    >                                 }
    >                         }
    >                 if (flag == 0)
    >                 {
    >                         fprintf(ofp, "%s ", str1);
    >                 }
    >                 i++;
    >                 memset (str1, 0, 50);
    >         }
    >         printf ("\n");
    >         fclose(ifp);
    >         fclose(ofp);
    >
    >         return 0;
    >
    > }
    >
    > ================== OUTPUT ========================
    >
    > [arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
    > repeated-words.c
    > remove-repeated-words.c: In function ‘main’:
    > remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
    > this function)
    > remove-repeated-words.c:33: error: (Each undeclared identifier is
    > reported only once
    > remove-repeated-words.c:33: error: for each function it appears in.)
    > remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
    > remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
    > function)
    > remove-repeated-words.c:35: warning: implicit declaration of function
    > ‘getline’
    > remove-repeated-words.c:19: warning: unused variable ‘baseifp’
    > [arnuld@dune programs]$
    >
    > Everything is explained in the comments, I have these ideas:
    >
    > 1) First #define __GNU_SOURCE has to go, its not a standard C facility.
    > 2) getline() is not a C function, so I think using fgets() will be a
    > better idea ?
    >
    > Will post the code as soon as I rewrite it. Till then can I have your
    > views ?
    >
    > --www.lispmachine.wordpress.com
    > my email is @ the above blog.


    I would rewrite it from scratch, using fgets() and strtok().

    The definition is unclear about repeated words.
    Does the program need to understand punctuation and capitalization?
    Is the goal to actually create a dictionary of unique words?

    If it is to be something akin to a spell checker, but having the
    function of duplicate word detection, then it is really a very
    difficult problem.
    And it probably shouldn't always do what is requested. For instance
    (from a Monty Python Script):
    John: "Oh Marsha, I could make a fool of myself!"
    Marsha: "Oh yes, John... Do! Do!"
    <John puts on gag glasses with funny nose and moustache attached>

    So, my two cents:
    1. Making a dictionary of unique words from a file is easy.
    2. Removing duplicate words from a file ignoring case and punctuation
    is much harder.
    3. Actual correction of English text so that the intent is preserved
    is an incredibly difficult problem.

    In any case, the above attempt accomplishes none of the above and
    should be re-written from scratch.
    IMO-YMMV.
    user923005, Sep 18, 2009
    #3
  4. arnuld

    user923005 Guest

    On Sep 18, 2:57 pm, user923005 <> wrote:
    > On Sep 18, 12:46 am, arnuld <> wrote:
    >
    >
    >
    >
    >
    > > /* A C program that reads a file and copies the contents to a new file
    > > while discarding all the repeated words.
    > >  * Written by one of my friends, posted by me on CLC for constructive
    > > criticism. I dont' think its a standard
    > >  * C program, hence I posted it here to make it one :)
    > >  *
    > >  * VERSION 0.0
    > >  *
    > >  */

    >
    > > #define __GNU__SOURCE
    > > #include <stdio.h>
    > > #include <stdlib.h>
    > > #include <string.h>
    > > int main ()
    > > {
    > >         char str1[50] = {0};
    > >         char *array[100];
    > >         FILE *ifp,*ofp;
    > >         FILE *baseifp;

    >
    > >         int i = 0,k=0,flag;
    > >         ifp = fopen("myfile", "r");
    > >         if(ifp==NULL)
    > >                 perror("input File is not open");
    > >         ofp = fopen("outputfile", "w");
    > >         if(ofp==NULL)
    > >                 perror("output File is not open");

    >
    > >         char * line = NULL;
    > >         size_t len = 0;
    > >         ssize_t read;
    > >         /* print read elements on stdout */
    > >         while ((read = getline(&line, &len, ifp)) != -1) {
    > >                 printf("Retrieved line of length %zu :\n", read);
    > >                 printf("%s", line);
    > >         }
    > >         if (line)
    > >                 free(line);
    > >         //fclose (ifp);
    > >         ifp = freopen("myfile", "r", ifp);

    >
    > >         while(fscanf(ifp, "%s", str1)!=EOF)
    > >         {
    > >                 printf ("%s\n",str1);
    > >                 flag = 0;
    > >                 array = (char *)malloc (strlen (str1)+1);
    > >                 strcpy(array,str1);

    >
    > >                 if(i > 0)
    > >                         for (k = 0; k < i ; k++)
    > >                         {
    > >                                 if (strcmp(array[k], str1)==0)
    > >                                 {
    > >                                         flag = 1;
    > >                                         break;
    > >                                 }
    > >                         }
    > >                 if (flag == 0)
    > >                 {
    > >                         fprintf(ofp, "%s ", str1);
    > >                 }
    > >                 i++;
    > >                 memset (str1, 0, 50);
    > >         }
    > >         printf ("\n");
    > >         fclose(ifp);
    > >         fclose(ofp);

    >
    > >         return 0;

    >
    > > }

    >
    > > ================== OUTPUT ========================

    >
    > > [arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
    > > repeated-words.c
    > > remove-repeated-words.c: In function ‘main’:
    > > remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
    > > this function)
    > > remove-repeated-words.c:33: error: (Each undeclared identifier is
    > > reported only once
    > > remove-repeated-words.c:33: error: for each function it appears in.)
    > > remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
    > > remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
    > > function)
    > > remove-repeated-words.c:35: warning: implicit declaration of function
    > > ‘getline’
    > > remove-repeated-words.c:19: warning: unused variable ‘baseifp’
    > > [arnuld@dune programs]$

    >
    > > Everything is explained in the comments, I have these ideas:

    >
    > > 1) First #define __GNU_SOURCE has to go, its not a standard C facility.
    > > 2) getline() is not a C function, so I think using fgets() will be a
    > > better idea ?

    >
    > > Will post the code as soon as I rewrite it. Till then can I have your
    > > views ?

    >
    > > --www.lispmachine.wordpress.com
    > > my email is @ the above blog.

    >
    > I would rewrite it from scratch, using fgets() and strtok().
    >
    > The definition is unclear about repeated words.
    > Does the program need to understand punctuation and capitalization?
    > Is the goal to actually create a dictionary of unique words?
    >
    > If it is to be something akin to a spell checker, but having the
    > function of duplicate word detection, then it is really a very
    > difficult problem.
    > And it probably shouldn't always do what is requested.  For instance
    > (from a Monty Python Script):
    > John: "Oh Marsha, I could make a fool of myself!"
    > Marsha: "Oh yes, John... Do! Do!"
    > <John puts on gag glasses with funny nose and moustache attached>
    >
    > So, my two cents:
    > 1.  Making a dictionary of unique words from a file is easy.
    > 2.  Removing duplicate words from a file ignoring case and punctuation
    > is much harder.
    > 3.  Actual correction of English text so that the intent is preserved
    > is an incredibly difficult problem.
    >
    > In any case, the above attempt accomplishes none of the above and
    > should be re-written from scratch.
    > IMO-YMMV.


    Maybe something like this:

    /*
    Purpose:
    Primitive program to detect and remove repeated words.
    It does not understand hyphenated continuations.
    It does not understand capitalization.
    It does not understand punctuation.
    It does not understand repetition for empahsis.
    It's dumb as a box of hammers.

    Limits:
    It won't work with lines or words bigger than 64K.

    Side effects:
    It strips out punctuation.
    It turns all white space into plain space chars.
    It turns all words into lower case words.

    Notes:
    Use at your own peril.
    */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>

    static char string[65535];
    static char save_token_string[65535];

    void clean_string(char *s)
    {
    while (*s) {
    if (ispunct(*s)) *s = ' ';
    else if (isspace(*s)) *s = ' ';
    else if (isupper(*s)) *s = (char) tolower(*s);
    s++;
    }
    }

    int main(void)
    {
    char *token = 0;
    const char *previous_token = "";
    char *data;
    while (data = fgets(string, sizeof string, stdin)) {

    clean_string(data);
    token = strtok(string, " ");
    while (token != NULL) {
    if (strcmp(token, previous_token) != 0)
    printf("%s ", token);
    strcpy(save_token_string, token);
    previous_token = save_token_string;
    token = strtok(NULL, " ");
    }
    }
    return 0;
    }
    /*
    Input file:
    C:\tmp>type pitts.dat
    Paris in the
    the Spring.

    Output:
    paris in the spring
    */
    user923005, Sep 18, 2009
    #4
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Peter Strøiman
    Replies:
    1
    Views:
    2,066
    Peter Strøiman
    Aug 23, 2005
  2. Daniele Menozzi
    Replies:
    9
    Views:
    8,726
    Roedy Green
    Jul 18, 2005
  3. arnuld
    Replies:
    10
    Views:
    1,736
    =?ISO-8859-1?Q?Erik_Wikstr=F6m?=
    Aug 3, 2007
  4. BerlinBrown
    Replies:
    6
    Views:
    4,420
  5. candide

    Extracting repeated words

    candide, Apr 1, 2011, in forum: Python
    Replies:
    2
    Views:
    295
    candide
    Apr 2, 2011
Loading...

Share This Page