TF-IDF

An implementation of the TF-IDF algorithm in C++ that finds information in a text file (myfile.txt) when you ask the program a question in natural language. If you find any calculation errors, please report them to me!
C++:
#include <iostream>
#include <map>
#include <unordered_map>
#include <string>
#include <sstream>
#include <fstream>
#include <cctype>
#include <unordered_set>
#include <algorithm>
#include <vector>
#include <numeric>   // std::accumulate
#include <cmath>     // std::log
using namespace std;

map<string, double> wordQueries;   // query word -> normalized term frequency
vector<string> ret;                // tokens produced by the last call to split()
vector<double> lengthOfLine;       // length (in characters) of each file line

vector<string> split(const string &s)
{
    // Clear tokens left over from any previous call so they don't accumulate.
    ret.clear();
    typedef string::size_type string_size;
    string_size i = 0;

    while (i != s.size())
    {
        // Skip leading whitespace (the cast avoids UB for non-ASCII chars).
        while (i != s.size() && isspace((unsigned char)s[i]))
            ++i;
        string_size j = i;

        // Advance j past the current token.
        while (j != s.size() && !isspace((unsigned char)s[j]))
            ++j;
        if (i != j)
        {
            ret.push_back(s.substr(i, j - i));
            i = j;
        }
    }

    return ret;
}

string Keyword(string token, string fileName, double tf)
{
    map<string, double> lineSelection;
    std::ifstream ifs(fileName);
    string line;
    while (getline(ifs, line))
    {
        if (line.find(token) != string::npos)
        {
            split(line);
            int N = ret.size();   // number of words in this line

            double tfnorm = wordQueries.find(token)->second;
            /*
            N = total number of words in the collection
            df = document frequency of word w
            idf = log[(N - df + 0.5)/(df + 0.5)]
            (in this code, N is the word count of the current line and the
            tf argument stands in for df)
            */
            double varidf = log((N - tf + 0.5) / (tf + 0.5));

            /*
            Combining TF and IDF:
            Just multiply the idf and the tf normalizations. The idf
            normalization is global, depending on the distribution of
            individual words in the collection. The tf normalization is
            local, and its purpose is to dampen the effect of words that
            appear too many times.
            */
            double normalization = tfnorm * varidf;

            lineSelection.insert(std::pair<string, double> (line, normalization));
        }
    }

    // Guard against the case where no raw line contains the token
    // (the token is lowercased, but lines in the file may not be).
    if (lineSelection.empty())
        return "No matching line found.";

    std::map<string, double>::iterator best = std::max_element(lineSelection.begin(), lineSelection.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    return best->first;
}

std::string remove_stop_words(const std::string &src, const std::unordered_set<std::string > &stops)
{
    std::string retval;
    std::istringstream strm(src);
    std::string word;
    while (strm >> word)
    {
        if (!stops.count(word))
            retval += word + " ";
    }

    if (!retval.empty())
        retval.pop_back();
    return retval;
}

int main()
{
    string insertit;
    double tf = 0.0;   // term frequency of the best-scoring query word
    std::string fileName = "myfile.txt";
    std::ifstream inFile(fileName);

    int wordCount = 0;
    int lineNum = 0;

    std::unordered_map<std::string, int> words;

    std::string str;
    cout << "Please enter your question " << endl;
    getline(cin, insertit);

    std::unordered_set<std::string > stops = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
        "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
        "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
        "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
        "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
        "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
        "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while",
        "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" };
    insertit = remove_stop_words(insertit, stops);

    /*
    In computing, stop words are words that are filtered out before or after natural language data (text)
    is processed. While "stop words" typically refers to the most common words in a language, natural
    language processing tools do not share a single universal list of stop words.
    */
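    // For example, the question "what is the capital of France" is reduced
    // to "capital France" (a hypothetical illustration; matching is
    // case-sensitive, so a capitalized "What" would not be filtered).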
    split(insertit);
    int M = ret.size();   // number of query tokens

    while (getline(inFile, str))
    {
        std::stringstream ss(str);
        std::string s;
        while (ss >> s)   // one whitespace-delimited token at a time
        {
            // Lowercase the token.
            std::transform(s.begin(), s.end(), s.begin(),
                [](unsigned char c) { return std::tolower(c); });
            // Strip any non-alphabetic characters.
            s.erase(std::remove_if(s.begin(), s.end(),
                [](unsigned char c) { return std::isalpha(c) == 0; }),
                s.end());
            if (!s.empty())
            {
                ++wordCount;
                ++words[s];
            }
        }

        ++lineNum;
        lengthOfLine.push_back(str.length());   // line length in characters
    }

    auto n = lengthOfLine.size();
    double avg_doc_len = 0.0;
    if (n != 0)
    {
        avg_doc_len = accumulate(lengthOfLine.begin(), lengthOfLine.end(), 0.0) / n;
    }

    // Assumes myfile.txt is non-empty, so avg_doc_len > 0.
    double doclen_correction = 0.75 * wordCount / avg_doc_len + 0.25;

    for (auto &pair: words)
    {
        for (int j = 0; j < M; j++)
        {
            if (pair.first == ret[j])
            {
                tf = pair.second / wordCount;
                double tfnorm = tf / (tf + doclen_correction);
                /*
                tf = number of times word appears in a document, the term frequency
                doclen = number of total words (including duplicates) in a document, the document length
                avg_doc_len = average of all document lengths in the collection
                doclen_correction = 0.75*doclen/avg_doclen + 0.25
                tfnorm= tf/(tf + doclen_correction)
                */
                wordQueries.insert(std::pair<string, double> (pair.first, tfnorm));
            }
        }
    }

    // Guard against a query whose words never appear in the file; otherwise
    // max_element on an empty map would be dereferenced below.
    if (wordQueries.empty())
    {
        cout << "Sorry, none of the question's words appear in the file." << endl;
        return 0;
    }

    std::map<string, double>::iterator best = std::max_element(wordQueries.begin(), wordQueries.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    string token = best->first;
    cout << "Let's talk about \"" << token << "\"... " << endl;
    cout << Keyword(token, fileName, tf) << endl;
    return 0;
}
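To make the weighting comments in the code concrete, here is a toy trace of the two formulas with made-up numbers (the file statistics below are hypothetical, not taken from any real myfile.txt):

C++:
#include <cmath>
#include <iostream>

int main()
{
    // Hypothetical figures: a query word occurring 3 times among 100 words
    // in the file, lines averaging 50 characters, and a matching line of
    // 12 words.
    double tf = 3.0 / 100.0;                              // term frequency
    double doclen_correction = 0.75 * 100 / 50.0 + 0.25;  // = 1.75
    double tfnorm = tf / (tf + doclen_correction);        // dampened tf
    double idf = std::log((12 - tf + 0.5) / (tf + 0.5));
    std::cout << tfnorm * idf << '\n';                    // prints roughly 0.053
    return 0;
}

The program then picks the line with the largest product. (To try the full program, save it as tfidf.cpp, compile with g++ -std=c++17 tfidf.cpp -o tfidf, and put some text in myfile.txt.)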
 
You must replace the line
std::unordered_map<std::string, int> words;
with this line:
std::unordered_map<std::string, double> words;
otherwise the integer division in tf = pair.second / wordCount truncates to zero, so tf = 0 and tfnorm = 0 as well. Here is the corrected code (a short demonstration of the truncation follows it):

C++:
#include <iostream>
#include <map>
#include <unordered_map>
#include <string>
#include <sstream>
#include <fstream>
#include <cctype>
#include <unordered_set>
#include <algorithm>
#include <vector>
#include <numeric>   // std::accumulate
#include <cmath>     // std::log
using namespace std;

map<string, double> wordQueries;   // query word -> normalized term frequency
vector<string> ret;                // tokens produced by the last call to split()
vector<double> lengthOfLine;       // length (in characters) of each file line

vector<string> split(const string &s)
{
    // Clear tokens left over from any previous call so they don't accumulate.
    ret.clear();
    typedef string::size_type string_size;
    string_size i = 0;

    while (i != s.size())
    {
        // Skip leading whitespace (the cast avoids UB for non-ASCII chars).
        while (i != s.size() && isspace((unsigned char)s[i]))
            ++i;
        string_size j = i;

        // Advance j past the current token.
        while (j != s.size() && !isspace((unsigned char)s[j]))
            ++j;
        if (i != j)
        {
            ret.push_back(s.substr(i, j - i));
            i = j;
        }
    }

    return ret;
}

string Keyword(string token, string fileName, double tf)
{
    map<string, double> lineSelection;
    std::ifstream ifs(fileName);
    string line;
    while (getline(ifs, line))
    {
        if (line.find(token) != string::npos)
        {
            split(line);
            int N = ret.size();   // number of words in this line

            double tfnorm = wordQueries.find(token)->second;
            /*
            N = total number of words in the collection
            df = document frequency of word w
            idf = log[(N - df + 0.5)/(df + 0.5)]
            (in this code, N is the word count of the current line and the
            tf argument stands in for df)
            */
            double varidf = log((N - tf + 0.5) / (tf + 0.5));

            /*
            Combining TF and IDF:
            Just multiply the idf and the tf normalizations. The idf
            normalization is global, depending on the distribution of
            individual words in the collection. The tf normalization is
            local, and its purpose is to dampen the effect of words that
            appear too many times.
            */
            double normalization = tfnorm * varidf;

            lineSelection.insert(std::pair<string, double> (line, normalization));
        }
    }

    // Guard against the case where no raw line contains the token
    // (the token is lowercased, but lines in the file may not be).
    if (lineSelection.empty())
        return "No matching line found.";

    std::map<string, double>::iterator best = std::max_element(lineSelection.begin(), lineSelection.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    return best->first;
}

std::string remove_stop_words(const std::string &src, const std::unordered_set<std::string > &stops)
{
    std::string retval;
    std::istringstream strm(src);
    std::string word;
    while (strm >> word)
    {
        if (!stops.count(word))
            retval += word + " ";
    }

    if (!retval.empty())
        retval.pop_back();
    return retval;
}

int main()
{
    string insertit;

    double doclen_correction;
    std::string fileName = "myfile.txt";
    std::ifstream inFile(fileName);

    int wordCount = 0;
    int lineNum = 0;
    double tf = 0.0;
    // The map must hold doubles so that tf = pair.second / wordCount below
    // is a floating-point division instead of a truncating integer one.
    std::unordered_map<std::string, double> words;

    std::string str;
    cout << "Please enter your question " << endl;
    getline(cin, insertit);
    
    std::unordered_set<std::string > stops = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
        "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
        "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
        "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
        "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
        "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
        "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while",
        "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" };
    insertit = remove_stop_words(insertit, stops);


    /*
    In computing, stop words are words that are filtered out before or after natural language data (text)
    is processed. While "stop words" typically refers to the most common words in a language, natural
    language processing tools do not share a single universal list of stop words.
    */
    split(insertit);
    

    while (getline(inFile, str))
    {
        std::stringstream ss(str);
        std::string s;
        while (ss >> s)   // one whitespace-delimited token at a time
        {
            // Lowercase the token.
            std::transform(s.begin(), s.end(), s.begin(),
                [](unsigned char c) { return std::tolower(c); });
            // Strip any non-alphabetic characters.
            s.erase(std::remove_if(s.begin(), s.end(),
                [](unsigned char c) { return std::isalpha(c) == 0; }),
                s.end());
            if (!s.empty())
            {
                ++wordCount;
                ++words[s];
            }
        }

        ++lineNum;
        lengthOfLine.push_back(str.length());   // line length in characters
    }

    auto n = lengthOfLine.size();
    double avg_doc_len = 0.0;
    if (n != 0)
    {
        avg_doc_len = accumulate(lengthOfLine.begin(), lengthOfLine.end(), 0.0) / n;
    }

    // Assumes myfile.txt is non-empty, so avg_doc_len > 0.
    doclen_correction = 0.75 * wordCount / avg_doc_len + 0.25;
    
    for (auto &pair: words)
    {
        int M = ret.size();   // number of query tokens
        for (int j = 0; j < M; j++)
        {
            if (pair.first == ret[j])
            {
                tf = pair.second / wordCount;   // now a floating-point division
                double tfnorm = tf / (tf + doclen_correction);
                /*
                tf = number of times word appears in a document, the term frequency
                doclen = number of total words (including duplicates) in a document, the document length
                avg_doc_len = average of all document lengths in the collection
                doclen_correction = 0.75*doclen/avg_doclen + 0.25
                tfnorm = tf/(tf + doclen_correction)
                */
                wordQueries.insert(std::pair<string, double> (pair.first, tfnorm));
            }
        }
    }

    // Guard against a query whose words never appear in the file; otherwise
    // max_element on an empty map would be dereferenced below.
    if (wordQueries.empty())
    {
        cout << "Sorry, none of the question's words appear in the file." << endl;
        return 0;
    }

    std::map<string, double>::iterator best = std::max_element(wordQueries.begin(), wordQueries.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    string token = best->first;
    cout << "Let's talk about \"" << token << "\"... " << endl;
    cout << Keyword(token, fileName, tf) << endl;
    return 0;
}
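For anyone wondering why the int map zeroes everything out: pair.second and wordCount are both integers in the original, so the division truncates. A minimal standalone demonstration (not part of the program above):

C++:
#include <iostream>

int main()
{
    int count = 3;      // what pair.second holds when words maps to int
    int total = 100;    // wordCount
    std::cout << count / total << '\n';           // prints 0: int / int truncates
    std::cout << double(count) / total << '\n';   // prints 0.03 once one operand is double
    return 0;
}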
 
