TF-IDF

An implementation of the TF-IDF algorithm in C++ that finds information in a text file (myfile.txt) when you ask the program a question in natural language. If you find any calculation errors, please report them to me!
C++:
#include <iostream>
#include <map>
#include <unordered_map>
#include <string>
#include <sstream>
#include <fstream>
#include <cctype>
#include <unordered_set>
#include <algorithm>
#include <vector>
#include <numeric>   // std::accumulate
#include <cmath>     // std::log
using namespace std;

map<string, double> wordQueries;   // query word -> normalized term frequency
vector<string> ret;                // tokens produced by the last call to split()
vector<double> lengthOfLine;       // length (in characters) of each file line

vector<string> split(const string &s)
{
    // Clear tokens left over from any previous call so they don't accumulate.
    ret.clear();
    typedef string::size_type string_size;
    string_size i = 0;

    while (i != s.size())
    {
        // Skip leading whitespace (the cast avoids UB for non-ASCII chars).
        while (i != s.size() && isspace((unsigned char)s[i]))
            ++i;
        string_size j = i;

        // Advance j past the current token.
        while (j != s.size() && !isspace((unsigned char)s[j]))
            ++j;
        if (i != j)
        {
            ret.push_back(s.substr(i, j - i));
            i = j;
        }
    }

    return ret;
}

string Keyword(string token, string fileName, double tf)
{
    map<string, double> lineSelection;
    std::ifstream ifs(fileName);
    string line;
    while (getline(ifs, line))
    {
        if (line.find(token) != string::npos)
        {
            split(line);
            int N = ret.size();   // number of words in this line

            double tfnorm = wordQueries.find(token)->second;
            /*
            N = total number of words in the collection
            df = document frequency of word w
            idf = log[(N - df + 0.5)/(df + 0.5)]
            (in this code, N is the word count of the current line and the
            tf argument stands in for df)
            */
            double varidf = log((N - tf + 0.5) / (tf + 0.5));

            /*
            Combining TF and IDF:
            Just multiply the idf and the tf normalizations. The idf
            normalization is global, depending on the distribution of
            individual words in the collection. The tf normalization is
            local, and its purpose is to dampen the effect of words that
            appear too many times.
            */
            double normalization = tfnorm * varidf;

            lineSelection.insert(std::pair<string, double> (line, normalization));
        }
    }

    // Guard against the case where no raw line contains the token
    // (the token is lowercased, but lines in the file may not be).
    if (lineSelection.empty())
        return "No matching line found.";

    std::map<string, double>::iterator best = std::max_element(lineSelection.begin(), lineSelection.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    return best->first;
}

std::string remove_stop_words(const std::string &src, const std::unordered_set<std::string > &stops)
{
    std::string retval;
    std::istringstream strm(src);
    std::string word;
    while (strm >> word)
    {
        if (!stops.count(word))
            retval += word + " ";
    }

    if (!retval.empty())
        retval.pop_back();
    return retval;
}

int main()
{
    string insertit;
    double tf = 0.0;   // term frequency of the best-scoring query word
    std::string fileName = "myfile.txt";
    std::ifstream inFile(fileName);

    int wordCount = 0;
    int lineNum = 0;

    std::unordered_map<std::string, int> words;

    std::string str;
    cout << "Please enter your question " << endl;
    getline(cin, insertit);

    std::unordered_set<std::string > stops = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
        "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
        "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
        "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
        "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
        "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
        "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while",
        "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" };
    insertit = remove_stop_words(insertit, stops);

    /*
    In computing, stop words are words that are filtered out before or after natural language data (text)
    is processed. While "stop words" typically refers to the most common words in a language, natural
    language processing tools do not share a single universal list of stop words.
    */
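    // For example, the question "what is the capital of France" is reduced
    // to "capital France" (a hypothetical illustration; matching is
    // case-sensitive, so a capitalized "What" would not be filtered).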
    split(insertit);
    int M = ret.size();   // number of query tokens

    while (getline(inFile, str))
    {
        std::stringstream ss(str);
        std::string s;
        while (ss >> s)   // one whitespace-delimited token at a time
        {
            // Lowercase the token.
            std::transform(s.begin(), s.end(), s.begin(),
                [](unsigned char c) { return std::tolower(c); });
            // Strip any non-alphabetic characters.
            s.erase(std::remove_if(s.begin(), s.end(),
                [](unsigned char c) { return std::isalpha(c) == 0; }),
                s.end());
            if (!s.empty())
            {
                ++wordCount;
                ++words[s];
            }
        }

        ++lineNum;
        lengthOfLine.push_back(str.length());   // line length in characters
    }

    auto n = lengthOfLine.size();
    double avg_doc_len = 0.0;
    if (n != 0)
    {
        avg_doc_len = accumulate(lengthOfLine.begin(), lengthOfLine.end(), 0.0) / n;
    }

    // Assumes myfile.txt is non-empty, so avg_doc_len > 0.
    double doclen_correction = 0.75 * wordCount / avg_doc_len + 0.25;

    for (auto &pair: words)
    {
        for (int j = 0; j < M; j++)
        {
            if (pair.first == ret[j])
            {
                tf = pair.second / wordCount;
                double tfnorm = tf / (tf + doclen_correction);
                /*
                tf = number of times word appears in a document, the term frequency
                doclen = number of total words (including duplicates) in a document, the document length
                avg_doc_len = average of all document lengths in the collection
                doclen_correction = 0.75*doclen/avg_doclen + 0.25
                tfnorm= tf/(tf + doclen_correction)
                */
                wordQueries.insert(std::pair<string, double> (pair.first, tfnorm));
            }
        }
    }

    // Guard against a query whose words never appear in the file; otherwise
    // max_element on an empty map would be dereferenced below.
    if (wordQueries.empty())
    {
        cout << "Sorry, none of the question's words appear in the file." << endl;
        return 0;
    }

    std::map<string, double>::iterator best = std::max_element(wordQueries.begin(), wordQueries.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    string token = best->first;
    cout << "Let's talk about \"" << token << "\"... " << endl;
    cout << Keyword(token, fileName, tf) << endl;
    return 0;
}
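To make the weighting comments in the code concrete, here is a toy trace of the two formulas with made-up numbers (the file statistics below are hypothetical, not taken from any real myfile.txt):

C++:
#include <cmath>
#include <iostream>

int main()
{
    // Hypothetical figures: a query word occurring 3 times among 100 words
    // in the file, lines averaging 50 characters, and a matching line of
    // 12 words.
    double tf = 3.0 / 100.0;                              // term frequency
    double doclen_correction = 0.75 * 100 / 50.0 + 0.25;  // = 1.75
    double tfnorm = tf / (tf + doclen_correction);        // dampened tf
    double idf = std::log((12 - tf + 0.5) / (tf + 0.5));
    std::cout << tfnorm * idf << '\n';                    // prints roughly 0.053
    return 0;
}

The program then picks the line with the largest product. (To try the full program, save it as tfidf.cpp, compile with g++ -std=c++17 tfidf.cpp -o tfidf, and put some text in myfile.txt.)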
 
You must replace the line
std::unordered_map<std::string, int> words;
with this line:
std::unordered_map<std::string, double> words;
otherwise the integer division in tf = pair.second / wordCount truncates to zero, so tf = 0 and tfnorm = 0 as well. Here is the corrected code (a short demonstration of the truncation follows it):

C++:
#include <iostream>
#include <map>
#include <unordered_map>
#include <string>
#include <sstream>
#include <fstream>
#include <cctype>
#include <unordered_set>
#include <algorithm>
#include <vector>
#include <numeric>   // std::accumulate
#include <cmath>     // std::log
using namespace std;

map<string, double> wordQueries;   // query word -> normalized term frequency
vector<string> ret;                // tokens produced by the last call to split()
vector<double> lengthOfLine;       // length (in characters) of each file line

vector<string> split(const string &s)
{
    // Clear tokens left over from any previous call so they don't accumulate.
    ret.clear();
    typedef string::size_type string_size;
    string_size i = 0;

    while (i != s.size())
    {
        // Skip leading whitespace (the cast avoids UB for non-ASCII chars).
        while (i != s.size() && isspace((unsigned char)s[i]))
            ++i;
        string_size j = i;

        // Advance j past the current token.
        while (j != s.size() && !isspace((unsigned char)s[j]))
            ++j;
        if (i != j)
        {
            ret.push_back(s.substr(i, j - i));
            i = j;
        }
    }

    return ret;
}

string Keyword(string token, string fileName, double tf)
{
    map<string, double> lineSelection;
    std::ifstream ifs(fileName);
    string line;
    while (getline(ifs, line))
    {
        if (line.find(token) != string::npos)
        {
            split(line);
            int N = ret.size();   // number of words in this line

            double tfnorm = wordQueries.find(token)->second;
            /*
            N = total number of words in the collection
            df = document frequency of word w
            idf = log[(N - df + 0.5)/(df + 0.5)]
            (in this code, N is the word count of the current line and the
            tf argument stands in for df)
            */
            double varidf = log((N - tf + 0.5) / (tf + 0.5));

            /*
            Combining TF and IDF:
            Just multiply the idf and the tf normalizations. The idf
            normalization is global, depending on the distribution of
            individual words in the collection. The tf normalization is
            local, and its purpose is to dampen the effect of words that
            appear too many times.
            */
            double normalization = tfnorm * varidf;

            lineSelection.insert(std::pair<string, double> (line, normalization));
        }
    }

    // Guard against the case where no raw line contains the token
    // (the token is lowercased, but lines in the file may not be).
    if (lineSelection.empty())
        return "No matching line found.";

    std::map<string, double>::iterator best = std::max_element(lineSelection.begin(), lineSelection.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    return best->first;
}

std::string remove_stop_words(const std::string &src, const std::unordered_set<std::string > &stops)
{
    std::string retval;
    std::istringstream strm(src);
    std::string word;
    while (strm >> word)
    {
        if (!stops.count(word))
            retval += word + " ";
    }

    if (!retval.empty())
        retval.pop_back();
    return retval;
}

int main()
{
    string insertit;

    double doclen_correction;
    std::string fileName = "myfile.txt";
    std::ifstream inFile(fileName);

    int wordCount = 0;
    int lineNum = 0;
    double tf = 0.0;
    // The map must hold doubles so that tf = pair.second / wordCount below
    // is a floating-point division instead of a truncating integer one.
    std::unordered_map<std::string, double> words;

    std::string str;
    cout << "Please enter your question " << endl;
    getline(cin, insertit);
    
    std::unordered_set<std::string > stops = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
        "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
        "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
        "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
        "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
        "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
        "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while",
        "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" };
    insertit = remove_stop_words(insertit, stops);


    /*
    In computing, stop words are words that are filtered out before or after natural language data (text)
    is processed. While "stop words" typically refers to the most common words in a language, natural
    language processing tools do not share a single universal list of stop words.
    */
    split(insertit);
    

    while (getline(inFile, str))
    {
        std::stringstream ss(str);
        std::string s;
        while (ss >> s)   // one whitespace-delimited token at a time
        {
            // Lowercase the token.
            std::transform(s.begin(), s.end(), s.begin(),
                [](unsigned char c) { return std::tolower(c); });
            // Strip any non-alphabetic characters.
            s.erase(std::remove_if(s.begin(), s.end(),
                [](unsigned char c) { return std::isalpha(c) == 0; }),
                s.end());
            if (!s.empty())
            {
                ++wordCount;
                ++words[s];
            }
        }

        ++lineNum;
        lengthOfLine.push_back(str.length());   // line length in characters
    }

    auto n = lengthOfLine.size();
    double avg_doc_len = 0.0;
    if (n != 0)
    {
        avg_doc_len = accumulate(lengthOfLine.begin(), lengthOfLine.end(), 0.0) / n;
    }

    // Assumes myfile.txt is non-empty, so avg_doc_len > 0.
    doclen_correction = 0.75 * wordCount / avg_doc_len + 0.25;
    
    for (auto &pair: words)
    {
        int M = ret.size();   // number of query tokens
        for (int j = 0; j < M; j++)
        {
            if (pair.first == ret[j])
            {
                tf = pair.second / wordCount;   // now a floating-point division
                double tfnorm = tf / (tf + doclen_correction);
                /*
                tf = number of times word appears in a document, the term frequency
                doclen = number of total words (including duplicates) in a document, the document length
                avg_doc_len = average of all document lengths in the collection
                doclen_correction = 0.75*doclen/avg_doclen + 0.25
                tfnorm = tf/(tf + doclen_correction)
                */
                wordQueries.insert(std::pair<string, double> (pair.first, tfnorm));
            }
        }
    }

    // Guard against a query whose words never appear in the file; otherwise
    // max_element on an empty map would be dereferenced below.
    if (wordQueries.empty())
    {
        cout << "Sorry, none of the question's words appear in the file." << endl;
        return 0;
    }

    std::map<string, double>::iterator best = std::max_element(wordQueries.begin(), wordQueries.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    string token = best->first;
    cout << "Let's talk about \"" << token << "\"... " << endl;
    cout << Keyword(token, fileName, tf) << endl;
    return 0;
}
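For anyone wondering why the int map zeroes everything out: pair.second and wordCount are both integers in the original, so the division truncates. A minimal standalone demonstration (not part of the program above):

C++:
#include <iostream>

int main()
{
    int count = 3;      // what pair.second holds when words maps to int
    int total = 100;    // wordCount
    std::cout << count / total << '\n';           // prints 0: int / int truncates
    std::cout << double(count) / total << '\n';   // prints 0.03 once one operand is double
    return 0;
}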
 
