Comparing Two Files line by line and word by word

Discussion in 'C Programming' started by Frost, Jan 11, 2006.

  1. Frost

    Frost Guest

    Hi All,

    I am a newbie i have written a c program on unix for line by line
    comparison for two files now could some one help on how i could do word
    by word comparison in case both lines have the same words but in
    jumbled order they should match and print only the dissimilar lines.The
    program also checks for multiple entries of the same line.

    Here file 2 converts to file 3 which is in the format of file1 and i
    compare file1 with file3.
    this program right now is checking line by line and for multiple
    entries of the same line,
    the program i wrote for word to word comparison is included in the
    comments which is not working if some could fix i would really be
    relieved.
    ---------------------------------------------------------------------------------------------------------------------------------------
    //==================================================================

    #include<stdio.h>

    #include<stdlib.h>

    #include<string.h>

    #define MAX 10000

    //==================================================================

    /*
     * Entry point.  Expects three file names:
     *   argv[1]  file1 - the reference file
     *   argv[2]  file2 - the file to convert
     *   argv[3]  file3 - receives the converted copy of file2
     *
     * file2 is first converted into file3 by fileconv(); file3 is then
     * reopened for reading and handed to filecomp() together with file1.
     * Exits with EXIT_FAILURE on bad usage or when a file cannot be opened
     * or written, and returns 0 otherwise.
     */
    int main(int argc,char *argv[])
    {
        FILE *fp1,*fp2,*fp3;
        void filecomp(FILE *,FILE *);
        void fileconv(FILE *,FILE *);

        if(argc!=4)
        {
            fprintf(stderr,"need three file names: file1 file2 file3\n");
            exit(EXIT_FAILURE);
        }

        /* Open the input first so that a missing file2 does not leave a
           freshly truncated file3 behind. */
        fp2=fopen(argv[2],"r");
        if(fp2==NULL)
        {
            fprintf(stderr,"cannot open file2\n");
            exit(EXIT_FAILURE);
        }
        fp3=fopen(argv[3],"w");
        if(fp3==NULL)
        {
            fprintf(stderr,"cannot open file3\n");
            fclose(fp2);
            exit(EXIT_FAILURE);
        }
        fileconv(fp2,fp3);
        fclose(fp2);
        /* fclose() flushes the buffered output, so this is where a failed
           write to file3 becomes visible. */
        if(fclose(fp3)!=0)
        {
            fprintf(stderr,"cannot write file3\n");
            exit(EXIT_FAILURE);
        }

        fp1=fopen(argv[1],"r");
        if(fp1==NULL)
        {
            fprintf(stderr,"cannot open file1\n");
            exit(EXIT_FAILURE);
        }
        fp3=fopen(argv[3],"r");
        if(fp3==NULL)
        {
            fprintf(stderr,"cannot open file3\n");
            fclose(fp1);
            exit(EXIT_FAILURE);
        }
        filecomp(fp1,fp3);
        fclose(fp1);
        fclose(fp3);

        return 0;
    }
    //==================================================================

    /*
     * Copy the stream fp2 to the stream fp3, writing '/' wherever the
     * input contains '.'.  All other bytes are copied unchanged.
     *
     * fp2 must be open for reading and fp3 for writing.  Copying stops at
     * end of input or at the first failed write; the function returns
     * nothing, so a caller that cares can inspect ferror(fp3) afterwards.
     */
    void fileconv(FILE *fp2,FILE *fp3)
    {
        /* fgetc() returns an int: either a byte converted to unsigned char
           or the negative value EOF.  Storing it in a plain char makes the
           byte 0xFF compare equal to EOF where char is signed, and makes
           EOF undetectable where char is unsigned. */
        int c;

        while((c=fgetc(fp2))!=EOF)
        {
            if(c=='.')
                c='/';
            if(fputc(c,fp3)==EOF)
                break;
        }
    }

    //==================================================================

    /*
     * Print a message for every line of fp1 that has no identical line
     * anywhere in fp3.  The position of the matching line in fp3 does not
     * matter, and a line that occurs several times in fp3 simply counts as
     * found.
     *
     * fp1 and fp3 must be open for reading and fp3 must be seekable,
     * because it is rescanned from the top for every line of fp1.  Lines
     * are read in pieces of at most MAX-1 characters; a longer line is
     * compared piece by piece.
     */
    void filecomp(FILE *fp1,FILE *fp3)
    {
        char line1[MAX],line2[MAX];
        int lineno=0;   /* 1-based number of the current line of fp1 */

        while(fgets(line1,MAX,fp1)!=NULL)
        {
            int found=0;

            lineno++;
            /* fgets() keeps the '\n'.  Cut it off so that a last line
               without a terminating newline can still match. */
            line1[strcspn(line1,"\n")]='\0';

            /*
             * The poster's non-working word-by-word attempt, kept for
             * reference.  It relied on the int locals ctr, octr, a (=0)
             * and b (=1), which are no longer declared:
             *
             * while (ctr!=EOF)
             * {
             * ctr=fgetc(fp1);
             * while(octr!=EOF)
             * {
             * octr=fgetc(fp3);
             * while(octr!='\n')
             * {
             * a++;
             * if(ctr==octr)
             * {
             * fseek(fp1,-a,1);
             * ctr=fgetc(fp1);
             * octr=fgetc(fp3);
             * b++;
             * }
             * while(ctr=='\n')
             * {
             * a=0;
             * ctr=fgetc(fp1);
             * }
             * }
             * b=0;
             * }
             * }
             */

            /* Search the whole of fp3 for the current line of fp1. */
            rewind(fp3);
            while(!found && fgets(line2,MAX,fp3)!=NULL)
            {
                line2[strcspn(line2,"\n")]='\0';
                if(strcmp(line1,line2)==0)
                    found=1;
            }

            if(!found)
                printf("line %d of file1 does not match lines of file3\n",lineno);
        }

        /* The posted code guarded this message with a read from the just
           rewound fp3, which only tested whether file3 was non-empty.
           Reaching this point means file1 has been read to its end, so the
           message is now printed unconditionally as a completion marker. */
        printf("both files have ended\n");
    }
    ---------------------------------------------------------------------------------------------------------------------------------------thanking
    u,
    Frost
     
    Frost, Jan 11, 2006
    #1
    1. Advertising

  2. Frost <> wrote:

    > I am a newbie i have written a c program on unix for line by line
    > comparison for two files now could some one help on how i could do word
    > by word comparison in case both lines have the same words but in
    > jumbled order they should match and print only the dissimilar lines.The
    > program also checks for multiple entries of the same line.


    > Here file 2 converts to file 3 which is in the format of file1 and i
    > compare file1 with file3.
    > this program right now is checking line by line and for multiple
    > entries of the same line,


    > (code trimmed)


    I don't have any comments on your code yet other than that the
    formatting was atrocious. I know posting well-formatted code to
    Usenet with Google's interface is a huge chore, but commenting on code
    that looks that bad is not worth the effort for your average
    comp.lang.c. guru. I've taken the liberty of reformatting it, which
    may be enough to get you some useful feedback:

    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>

    #define MAX 10000

    /*
     * Convert argv[2] into argv[3] with fileconv(), then pass argv[1] and
     * the converted file to filecomp().
     *
     * Exits with EXIT_FAILURE when the argument count is wrong or a file
     * cannot be opened or written; returns 0 otherwise.
     */
    int main( int argc, char *argv[] )
    {
        FILE *fp1, *fp2, *fp3;
        void filecomp( FILE *, FILE * );
        void fileconv( FILE *, FILE * );

        if( argc != 4 ) {
            fprintf( stderr, "need three file names: file1 file2 file3\n" );
            exit( EXIT_FAILURE );
        }

        /* Input before output: opening argv[3] with "w" truncates it, which
           should not happen when argv[2] turns out to be unreadable. */
        fp2 = fopen( argv[2], "r" );
        if( fp2 == NULL ) {
            fprintf( stderr, "cannot open file2\n" );
            exit( EXIT_FAILURE );
        }
        fp3 = fopen( argv[3], "w" );
        if( fp3 == NULL ) {
            fprintf( stderr, "cannot open file3\n" );
            fclose( fp2 );
            exit( EXIT_FAILURE );
        }
        fileconv( fp2, fp3 );
        fclose( fp2 );
        /* Closing flushes the buffer; a failure here means the converted
           file is incomplete. */
        if( fclose( fp3 ) != 0 ) {
            fprintf( stderr, "cannot write file3\n" );
            exit( EXIT_FAILURE );
        }

        fp1 = fopen( argv[1], "r" );
        if( fp1 == NULL ) {
            fprintf( stderr, "cannot open file1\n" );
            exit( EXIT_FAILURE );
        }
        fp3 = fopen( argv[3], "r" );
        if( fp3 == NULL ) {
            fprintf( stderr, "cannot open file3\n" );
            fclose( fp1 );
            exit( EXIT_FAILURE );
        }
        filecomp( fp1, fp3 );
        fclose( fp1 );
        fclose( fp3 );
        return 0;
    }

    /*
     * Copy fp2 to fp3 byte by byte, substituting '/' for every '.'.
     *
     * Stops at end of input or as soon as a write fails.  Nothing is
     * returned; ferror( fp3 ) tells the caller whether a write failed.
     */
    void fileconv( FILE *fp2, FILE *fp3 )
    {
        int c;  /* must be int: fgetc() yields an unsigned char value or EOF */

        while( (c = fgetc( fp2 )) != EOF ) {
            if( c == '.' ) {
                c = '/';
            }
            if( fputc( c, fp3 ) == EOF ) {
                return;
            }
        }
    }

    /*
     * For each line of fp1, scan all of fp3 for an identical line and
     * print a message when none is found.  Line order in fp3 is
     * irrelevant.
     *
     * fp3 must be seekable, since it is rewound before every scan.  Lines
     * longer than MAX-1 characters are read, and therefore compared, in
     * pieces.
     */
    void filecomp( FILE *fp1, FILE *fp3 )
    {
        char line1[MAX], line2[MAX];
        int lineno = 0;     /* number of the fp1 line being looked up */

        while( fgets( line1, MAX, fp1 ) != NULL ) {
            int matched = 0;

            lineno++;
            /* Strip the newline kept by fgets() so that an unterminated
               last line compares equal to a terminated one. */
            line1[strcspn( line1, "\n" )] = '\0';

            rewind( fp3 );
            while( fgets( line2, MAX, fp3 ) != NULL ) {
                line2[strcspn( line2, "\n" )] = '\0';
                if( strcmp( line1, line2 ) == 0 ) {
                    matched = 1;
                    break;      /* one hit is enough */
                }
            }
            if( !matched ) {
                printf("line %d of file1 does not match lines of file3\n", lineno );
            }
        }

        /* Previously printed only when a read from the rewound fp3
           succeeded, i.e. when file3 was not empty.  The loop above ends
           once fp1 is exhausted, so the message is now unconditional. */
        printf( "both files have ended\n" );
    }

    --
    Christopher Benson-Manica | I *should* know what I'm talking about - if I
    ataru(at)cyberspace.org | don't, I need to know. Flames welcome.
     
    Christopher Benson-Manica, Jan 11, 2006
    #2
    1. Advertising

  3. Frost

    Frost Guest

    thamks for ur help next time ill make sure code is formated properly
     
    Frost, Jan 12, 2006
    #3
  4. Frost

    Chuck F. Guest

    Frost wrote:
    > thamks for ur help next time ill make sure code is formated properly

    ^^ ^^^

    Ignoring typos, those words are probably "your" and "I'll". Proper
    spelling and capitalization make things much easier to read. Also
    read (and heed) the instructions and references below, to include
    adequate context.

    --
    "If you want to post a followup via groups.google.com, don't use
    the broken "Reply" link at the bottom of the article. Click on
    "show options" at the top of the article, then click on the
    "Reply" at the bottom of the article headers." - Keith Thompson
    More details at: <http://cfaj.freeshell.org/google/>
     
    Chuck F., Jan 12, 2006
    #4
  5. Frost

    Frost Guest

    Christopher Benson-Manica wrote:

    > Frost <> wrote:
    >
    > > I am a newbie i have written a c program on unix for line by line
    > > comparison for two files now could some one help on how i could do word
    > > by word comparison in case both lines have the same words but in
    > > jumbled order they should match and print only the dissimilar lines.The
    > > program also checks for multiple entries of the same line.

    >
    > > Here file 2 converts to file 3 which is in the format of file1 and i
    > > compare file1 with file3.
    > > this program right now is checking line by line and for multiple
    > > entries of the same line,and prints only dissimilar lines please can u give me a program which i can check word my word so that if the lines contain the same words but in different positions they should match.

    >
    > #include<stdio.h>
    > #include<stdlib.h>
    > #include<string.h>
    >
    > #define MAX 10000
    >
    > main( int argc, char *argv[] )
    > {
    > FILE *fp1, *fp2, *fp3;
    > void filecomp( FILE *, FILE * );
    > void fileconv( FILE *, FILE * );
    >
    > if( argc != 4 ) {
    > fprintf( stderr, "need two files\n" );
    > exit( 1 );
    > }
    > fp3=fopen( argv[3], "w" );
    > fp2=fopen( argv[2], "r" );
    > if( fp2 == NULL ) {
    > printf( "cannot open file2\n" );
    > exit( 1 );
    > }
    > if( fp3 == NULL ) {
    > printf( "cannot open file3\n" );
    > exit( 1 );
    > }
    > fileconv( fp2, fp3 );
    > fclose( fp2 );
    > fclose( fp3 );
    > fp1=fopen( argv[1], "r" );
    > if( fp1 == NULL ) {
    > printf( "cannot open file1\n" );
    > exit( 1 );
    > }
    > fp3=fopen( argv[3], "r" );
    > if( fp3 == NULL ) {
    > printf("cannot open file3\n");
    > exit( 1 );
    > }
    > filecomp( fp1, fp3 );
    > fclose( fp1 );
    > fclose( fp3 );
    > exit( 0 );
    > }
    >
    > void fileconv( FILE *fp2, FILE *fp3 )
    > {
    > char c;
    > while( (c=fgetc(fp2)) != EOF ) {
    > if( c == '.' ) {
    > c='/';
    > fputc(c,fp3);
    > }
    > else
    > fputc( c, fp3 );
    > }
    > }
    >
    > void filecomp( FILE *fp1, FILE *fp3 )
    > {
    > char line1[MAX], line2[MAX];
    > char *s1, *s2;
    > int ctr, octr, a=0, b=1;
    > int i,count1=0, count2=1, count3=0, count4=0;
    > while( ((s1=fgets(line1,MAX,fp1)) != NULL) ) {
    > count1++;
    > while( (s2=fgets(line2,MAX,fp3)) != NULL ) {
    > i=strcmp( s1, s2 );
    > if( i == 0 ) {
    > count3++;
    > }
    > count2++;
    > }
    > if( count3==0 )
    > printf("line %d of file1 does not match lines of file3\n", count1 );
    > count2=1;
    > count3=0;
    > fseek( fp3, 0, SEEK_SET );
    > }
    > if( (s2=fgets(line2,MAX,fp3)) != NULL )
    > printf( "both files have ended\n" );
    > }
    >
     
    Frost, Jan 13, 2006
    #5
  6. Frost

    Michael Mair Guest

    Frost wrote:
    > Christopher Benson-Manica wrote:
    >
    >>Frost <> wrote:
    >>
    >>>I am a newbie i have written a c program on unix for line by line
    >>>comparison for two files now could some one help on how i could do word
    >>>by word comparison in case both lines have the same words but in
    >>>jumbled order they should match and print only the dissimilar lines.The
    >>>program also checks for multiple entries of the same line.

    >>
    >>>Here file 2 converts to file 3 which is in the format of file1 and i
    >>>compare file1 with file3.
    >>>this program right now is checking line by line and for multiple
    >>>entries of the same line,and prints only dissimilar lines please can u give me a program which i can check word my word so that if the lines contain the same words but in different positions they should match.


    I could, but I will not. At least not directly.
    Let us first clean up your program. Then we will discuss what to do
    to make it more efficient. The extension of what we do line by line
    can be used for the word by word version.

    >>#include<stdio.h>
    >>#include<stdlib.h>
    >>#include<string.h>
    >>
    >>#define MAX 10000
    >>
    >>main( int argc, char *argv[] )


    main returns int. Do not rely on implicit int as
    C99 made it illegal.
    Write
    int main (int argc, char **argv)
    or
    int main (void)
    instead.

    >>{
    >> FILE *fp1, *fp2, *fp3;
    >> void filecomp( FILE *, FILE * );
    >> void fileconv( FILE *, FILE * );


    This can bite you later on if you move the functionality
    out of main(), especially if you rely on implicit int elsewhere
    and do not turn up the warning level of your compiler...

    >> if( argc != 4 ) {
    >> fprintf( stderr, "need two files\n" );
    >> exit( 1 );


    Non-portable argument to exit(): Use EXIT_FAILURE instead.
    To signal success, use EXIT_SUCCESS or 0.

    >> }
    >> fp3=fopen( argv[3], "w" );
    >> fp2=fopen( argv[2], "r" );
    >> if( fp2 == NULL ) {
    >> printf( "cannot open file2\n" );
    >> exit( 1 );
    >> }
    >> if( fp3 == NULL ) {
    >> printf( "cannot open file3\n" );
    >> exit( 1 );
    >> }
    >> fileconv( fp2, fp3 );
    >> fclose( fp2 );
    >> fclose( fp3 );
    >> fp1=fopen( argv[1], "r" );
    >> if( fp1 == NULL ) {
    >> printf( "cannot open file1\n" );
    >> exit( 1 );
    >> }
    >> fp3=fopen( argv[3], "r" );
    >> if( fp3 == NULL ) {
    >> printf("cannot open file3\n");
    >> exit( 1 );
    >> }


    Instead of closing and reopening, just open fp3 as "w+"
    and rewind() it.

    >> filecomp( fp1, fp3 );
    >> fclose( fp1 );
    >> fclose( fp3 );
    >> exit( 0 );


    Matter of taste:
    return 0;
    does not do exactly the same thing, but is appropriate
    as well.
    >>}
    >>
    >>void fileconv( FILE *fp2, FILE *fp3 )
    >>{
    >> char c;


    The return type of fgetc() is int.
    EOF is a negative int value.
    char may be an unsigned type.

    Moreover, fgetc() returns the character value converted
    to unsigned char or EOF.
    In short:
    Make c an int.

    >> while( (c=fgetc(fp2)) != EOF ) {


    Note: The macro getc() does the same but may be implemented
    more efficiently.

    >> if( c == '.' ) {
    >> c='/';
    >> fputc(c,fp3);
    >> }
    >> else
    >> fputc( c, fp3 );
    >> }
    >>}
    >>
    >>void filecomp( FILE *fp1, FILE *fp3 )
    >>{
    >> char line1[MAX], line2[MAX];
    >> char *s1, *s2;
    >> int ctr, octr, a=0, b=1;
    >> int i,count1=0, count2=1, count3=0, count4=0;


    Look at these names and tell me you will know what they mean
    in twelve months' time. Keep a straight face.

    >> while( ((s1=fgets(line1,MAX,fp1)) != NULL) ) {
    >> count1++;
    >> while( (s2=fgets(line2,MAX,fp3)) != NULL ) {
    >> i=strcmp( s1, s2 );
    >> if( i == 0 ) {
    >> count3++;
    >> }
    >> count2++;
    >> }
    >> if( count3==0 )
    >> printf("line %d of file1 does not match lines of file3\n", count1 );
    >> count2=1;
    >> count3=0;
    >> fseek( fp3, 0, SEEK_SET );
    >> }
    >> if( (s2=fgets(line2,MAX,fp3)) != NULL )
    >> printf( "both files have ended\n" );
    >>}


    What is wrong with your approach: You ignore the possibility
    that your line length might exceed MAX. Unfortunately, there
    is no nice standard C way to do it right but you can use the
    public domain ggets()/fggets() of C.B. Falconer which is portable.

    How can that make problems?
    Suppose l1 is too long and can be decomposed into l1a and l1b;
    If we now have the fragments l1a and l1b somewhere in the second
    file, we could be led to the conclusion that the files are
    identical even if they are not.

    In addition, you will not find lines which are only in the
    second file but not in the first one.

    Let us consider a line by line comparison without multiple lines
    and the points I raised above:


    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    /* Use the PD fggets() from http://cbfalconer.home.att.net/download/ */
    #include "ggets/ggets.h"

    void fileconv (FILE *conv_src, FILE *conv_dest);
    void filecomp (FILE *comp_src, FILE *conv_dest);

    /*
     * Usage: prog comp_src conv_src conv_dest
     *
     * conv_src (argv[2]) is converted into conv_dest (argv[3]) by
     * fileconv(); conv_dest is then rewound and compared against comp_src
     * (argv[1]) by filecomp().
     *
     * Exits with EXIT_FAILURE on bad usage or when a file cannot be opened
     * or written; returns 0 otherwise.
     */
    int main (int argc, char **argv)
    {
        FILE *comp_src, *conv_src, *conv_dest;

        if (argc != 4) {
            fprintf(stderr, "Need THREE files (comp_src conv_src conv_dest)\n");
            exit(EXIT_FAILURE);
        }

        /* Conversion */
        /* The source is opened first: "w+" truncates the destination, and
           that should not happen when there is nothing to convert.  Each
           stream is closed only if it was actually opened, because passing
           a null pointer to fclose() is undefined. */
        conv_src = fopen(argv[2], "r");
        if (conv_src == NULL) {
            fprintf(stderr, "Cannot open files for conversion\n");
            exit(EXIT_FAILURE);
        }
        conv_dest = fopen(argv[3], "w+");
        if (conv_dest == NULL) {
            fprintf(stderr, "Cannot open files for conversion\n");
            (void)fclose(conv_src);
            exit(EXIT_FAILURE);
        }
        fileconv(conv_src, conv_dest);
        (void)fclose(conv_src);
        /* Check for write errors now: rewind() below clears the stream's
           error indicator. */
        if (fflush(conv_dest) == EOF || ferror(conv_dest)) {
            fprintf(stderr, "Cannot write %s\n", argv[3]);
            (void)fclose(conv_dest);
            exit(EXIT_FAILURE);
        }

        /* Comparison */
        comp_src = fopen(argv[1], "r");
        if (comp_src == NULL) {
            fprintf(stderr, "Cannot open %s\n", argv[1]);
            (void)fclose(conv_dest);
            exit(EXIT_FAILURE);
        }
        rewind(conv_dest);
        filecomp(comp_src, conv_dest);
        (void)fclose(comp_src);
        (void)fclose(conv_dest);

        return 0;
    }

    /*
     * Copy conv_src to conv_dest, replacing every '.' with '/'.
     *
     * Returns when the input is exhausted or as soon as a write fails.
     * The function has no return value, so a failed write is reported only
     * through the error indicator of conv_dest (see ferror()).
     */
    void fileconv (FILE *conv_src, FILE *conv_dest)
    {
        int c;

        while ((c = getc(conv_src)) != EOF) {
            if (c == '.') {
                c = '/';
            }

            if (EOF == putc(c, conv_dest)) {
                /* Do not keep writing to a stream that has failed. */
                return;
            }
        }
    }

    /*
     * Compare src1 and src2 line by line, in order: line k of src1 is
     * compared with line k of src2, and a message is printed for every
     * pair that differs.  "both files have ended" is printed only when
     * both streams run out after the same number of lines.
     *
     * Both streams are rewound first, so they must be seekable.  Lines are
     * read with fggets(), which hands back a malloc()ed buffer that this
     * function has to free().  As used here, fggets() returns 0 when a
     * line was read and EOF at end of input.
     */
    void filecomp (FILE *src1, FILE *src2)
    {
        /* Kept NULL whenever no buffer is owned, so that the free() calls
           at the end are always safe. */
        char *line1 = NULL, *line2 = NULL;
        size_t line_count = 0;
        int src1_status;        /* result of the last read from src1 */
        int src2_ran_out = 0;   /* src2 had no partner for a src1 line */

        rewind(src1); /* Does not hurt but makes sure that */
        rewind(src2); /* we start from the beginning. */

        while (0 == (src1_status = fggets(&line1, src1))) {
            int is_equal;

            ++line_count;
            if (0 != fggets(&line2, src2)) {
                /* line1 is still owned here; it is released below. */
                src2_ran_out = 1;
                break;
            }
            is_equal = (0 == strcmp(line1, line2));

            free(line1); /* Drawback of *ggets(): You have to free() */
            line1 = NULL;
            free(line2); /* yourself. */
            line2 = NULL;

            if (!is_equal)
                printf("line %lu of comp_src does not match "
                       "normalized lines of conv_src\n",
                       (unsigned long)line_count);
        }

        /* src1 is known to have ended from the loop condition, so only
           src2 still needs probing.  Reading src1 again here would
           overwrite (and leak) a line still held after the break, and
           would swallow a surplus line of src1. */
        if (!src2_ran_out && EOF == src1_status
            && EOF == fggets(&line2, src2))
            printf("both files have ended\n");

        free(line1);
        free(line2);
    }

    Now, how can we give that at least the functionality you had?
    As I understand, you cared nothing about the position of the
    lines within the files.
    So: Read in all lines of src1, store them in an array a1 of
    strings; do the same for src2 (->array a2).
    Sort them, e.g. using strcmp() and qsort().
    If you are not interested in multiples, you can throw them
    out at sorting if you sort yourself or do it after sorting.
    Otherwise, go to the last of the multiple lines when
    going through the lines.

    In the following, I pretend there are no multiple lines:
    Now, proceed through the arrays and check all lines whether
    they are identical.
    BTW: You do not even need to generate a separate converted file;
    when reading in the lines, do your "conversion" on the fly.

    What have we lost? The line numbers. Okay, let us use arrays
    a1, a2 of struct line { size_t line_no; char *line_content;}
    instead and pass the difference.

    Now, the last step: Before sorting the lines, sort every line's
    words. This can be, for example, done by splitting the line into
    an array of strings where every string is a word, sorting this
    array, and generating a string from it containing all the words,
    separated by a space.

    Just try it; the clc crowd will help you if you struggle with
    the details.


    Cheers
    Michael
    --
    E-Mail: Mine is an /at/ gmx /dot/ de address.
     
    Michael Mair, Jan 14, 2006
    #6
  7. Frost

    Frost Guest

    thanks for ur help u been very helpful,ur program is awsome but i am
    not getting the idea of what u asked me to do or how to do it
    i have basic knowledge of c and now im learning all these file
    handling,datastuctures,pointers in c so can u help me here agian if it
    is not to much trouble.
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    Now, how can we give that at least the functionality you had?
    As I understand, you cared nothing about the position of the
    lines within the files.
    So: Read in all lines of src1, store them in an array a1 of
    strings; do the same for src2 (->array a2).
    Sort them, e.g. using strcmp() and qsort().
    If you are not interested in multiples, you can throw them
    out at sorting if you sort yourself or do it after sorting.
    Otherwise, go to the last of the multiple lines when
    going through the lines.
    Now, the last step: Before sorting the lines, sort every line's
    words. This can be, for example, done by splitting the line into
    an array of strings where every string is a word, sorting this
    array, and generating a string from it containing all the words,
    separated by a space
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
     
    Frost, Jan 17, 2006
    #7
  8. Frost

    Frost Guest

    hello all thanks for all the replies but im still stuck on line by line
    word comparison part between two files, the lines may contain the same
    words in different places what i mean to say is contents of both the
    lines are same hence they should match can some one help me with such a
    program.
     
    Frost, Feb 10, 2006
    #8
  9. Frost wrote:
    > hello all thanks for all the replies but im still stuck on line by line
    > word comparison part between two files, the lines may contain the same
    > words in different places what i mean to say is contents of both the
    > lines are same hence they should match can some one help me with such a
    > program.


    I guess you want to say that these two lines are the same:

    foo bar baz
    bar foo baz

    One solution would be to create a sorted list of words in each of the
    two lines you're comparing. Then test whether two lists are equivalent.
    You may also have to decide whether case matters, and what constitutes
    a word.

    A possible reason for your previous post being ignored is that no-one
    here will give you lectures in basics that you can get from good
    textbooks. Look up all concepts and functions mentioned, study, try to
    use. If you encounter problems in the /last/ bit, come back here for
    help.

    In terms of general algorithms and programming methods, comp.programming
    is a much better place to ask. It's not frowned upon here, either, but
    direct C-related questions are of primary interest.

    Also, please read (and heed) this, again:

    "If you want to post a followup via groups.google.com, don't use
    the broken "Reply" link at the bottom of the article. Click on
    "show options" at the top of the article, then click on the
    "Reply" at the bottom of the article headers." - Keith Thompson
    More details at: <http://cfaj.freeshell.org/google/>

    --
    BR, Vladimir
     
    Vladimir S. Oka, Feb 10, 2006
    #9
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. GenxLogic
    Replies:
    3
    Views:
    1,350
    andrewmcdonagh
    Dec 6, 2006
  2. Nick Matzke

    Comparing two book chapters (text files)

    Nick Matzke, Feb 5, 2009, in forum: Python
    Replies:
    1
    Views:
    298
    andrew cooke
    Feb 5, 2009
  3. ruds

    comparing two test files

    ruds, Dec 23, 2011, in forum: Java
    Replies:
    5
    Views:
    396
    Gene Wirchenko
    Dec 23, 2011
  4. Brajmohan S.

    Comparing a string word by word

    Brajmohan S., Jan 17, 2011, in forum: Ruby
    Replies:
    2
    Views:
    122
    Robert Klemme
    Jan 19, 2011
  5. Replies:
    3
    Views:
    139
    John W. Krahn
    Nov 28, 2007
Loading...

Share This Page