An implementation of the TF-IDF algorithm in C++ that finds information in a text file (myfile.txt) when the user asks the program a question in natural language. If there are any calculation errors, please report them to me!
C++:
#include <iostream>
#include <map>
#include <string>
#include <sstream>
#include <fstream>
#include <cctype>
#include <unordered_set>
#include <algorithm>
#include <cctype>
#include <bits/stdc++.h>
#include <math.h>
using namespace std;
// Shared state used by split(), Keyword() and main().
map<string, double> wordQueries;   // query word -> normalized TF score (tfnorm)
vector<string> ret;                // token buffer (re)filled by split()
vector<double> lengthOfLine;       // character length of each line of the file

// Tokenizes `s` on whitespace into the global `ret` and returns it.
//
// Bug fixed: the function previously appended to `ret` without clearing it,
// so every call accumulated the tokens of all earlier calls. That inflated
// the per-line word count N used by Keyword() and skewed the IDF values.
// `ret` is now reset at the start of each call.
vector<string> split(const string &s)
{
ret.clear(); // reset the shared token buffer (was missing)
typedef string::size_type string_size;
string_size i = 0;
while (i != s.size())
{
// Skip leading whitespace. The cast matters: passing a negative
// plain `char` to isspace() is undefined behavior.
while (i != s.size() && isspace(static_cast<unsigned char>(s[i])))
++i;
// Advance j to the end of the current token.
string_size j = i;
while (j != s.size() && !isspace(static_cast<unsigned char>(s[j])))
j++;
if (i != j)
{
ret.push_back(s.substr(i, j - i));
i = j;
}
}
return ret;
}
// Scores every line of `fileName` that contains `token` with a BM25-style
// TF-IDF weight and returns the highest-scoring line.
//
// Parameters:
//   token    - the query word to search for; must have been scored into
//              wordQueries by main()
//   fileName - path of the text file to scan
//   tf       - raw term frequency of `token`, used as the df term of the
//              IDF formula below
//
// Returns the best-matching line, or an empty string when `token` is not
// in wordQueries or no line of the file contains it.
//
// Fixes: the original dereferenced wordQueries.find(token) and
// max_element(...) without checking for end() — undefined behavior when
// the token was unscored or no line matched.
string Keyword(string token, string fileName, double tf)
{
map<string, double> lineSelection;
std::ifstream ifs(fileName);
string line;
// Look the precomputed TF normalization up once, outside the loop.
auto qit = wordQueries.find(token);
if (qit == wordQueries.end())
return ""; // token was never scored; nothing sensible to rank
double tfnorm = qit->second;
while (getline(ifs, line))
{
if (line.find(token) != string::npos)
{
split(line);
int N = ret.size(); // word count of this line
/*
N = total number of words in the collection
df = document frequency of word w
idf = log[(N - df + 0.5)/(df + 0.5)]
*/
double varidf = log((N - tf + 0.5) / (tf + 0.5));
/*
Combining TF and IDF
Just multiply the idf and the tf normalizations. The idf
normalization is global, depending on the distribution of
individual words in the collection. The tf normalization is local,
and its purpose is to dampen the effect of words that appear
too many times.
*/
double normalization = tfnorm * varidf;
lineSelection.insert(std::pair<string, double> (line, normalization));
}
}
if (lineSelection.empty())
return ""; // no line contained the token (was: max_element on an empty map)
// const auto& avoids converting each map node to pair<string,double>
// (a string copy per comparison in the original lambda signature).
auto best = std::max_element(lineSelection.begin(), lineSelection.end(),
[](const auto &a, const auto &b)
{
return a.second < b.second;
});
return best->first;
}
// Returns `src` with every whitespace-delimited token that appears in
// `stops` removed. Surviving words are re-joined with single spaces and
// the result carries no trailing space.
std::string remove_stop_words(const std::string &src, const std::unordered_set<std::string > &stops)
{
std::istringstream tokens(src);
std::string token;
std::string filtered;
bool first = true;
while (tokens >> token)
{
if (stops.count(token))
continue; // drop stop words
if (!first)
filtered += ' ';
filtered += token;
first = false;
}
return filtered;
}
// Entry point: reads a natural-language question, strips stop words,
// scores each remaining query word with a BM25-style TF normalization
// over myfile.txt, picks the highest-scoring word, and prints the file
// line that best matches it.
//
// Calculation errors fixed (as requested in the file header):
//  - `pair.second / wordCount` was an int/int division, truncating the
//    term frequency to 0 for essentially every word.
//  - `tf` was passed to Keyword() uninitialized when no query word
//    occurred in the file.
//  - doclen_correction divided by zero when the file was empty/missing.
//  - max_element was dereferenced on a possibly-empty wordQueries map (UB).
int main()
{
string insertit;   // the user's question
double tf = 0.0;   // raw TF of the last query word matched in the file
std::string fileName = "myfile.txt";
std::ifstream inFile(fileName);
int wordCount = 0; // total number of words in the file
int lineNum = 0;   // number of lines in the file
std::unordered_map<std::string, int> words; // word -> occurrences in file
std::string str;
cout << "Please enter your question " << endl;
getline(cin, insertit);
std::unordered_set<std::string > stops = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
"been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
"down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
"here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
"let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
"over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
"them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
"up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while",
"who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" };
insertit = remove_stop_words(insertit, stops);
/*
In computing, stop words are words that are filtered out before or after the natural language data (text)
are processed. While stop words typically refers to the most common words in a language, all-natural language
processing tools don't use a single universal list of stop words.
*/
split(insertit); // tokenizes the cleaned question into the global `ret`
// Read the file: lower-case each word, strip non-alphabetic characters,
// and record occurrence counts plus per-line lengths.
while (getline(inFile, str))
{
std::stringstream ss(str);
while (ss)
{
std::string s;
ss >> s;
std::transform(s.begin(), s.end(), s.begin(),
[](unsigned char c)
{
return std::tolower(c);
});
s.erase(std::remove_if(s.begin(), s.end(),
[](unsigned char c)
{
return std::isalpha(c) == 0;
}),
s.end());
if (!s.empty())
{
++wordCount;
++words[s];
}
}
++lineNum;
lengthOfLine.push_back(static_cast<double>(str.length()));
}
auto n = lengthOfLine.size();
double avg_doc_len = 0.0;
if (n != 0)
{
avg_doc_len = accumulate(lengthOfLine.begin(), lengthOfLine.end(), 0.0) / n;
}
/*
doclen_correction = 0.75*doclen/avg_doclen + 0.25
Guard: with an empty or missing file avg_doc_len is 0 and the division
below would be undefined; fall back to the additive constant alone.
*/
double doclen_correction = (avg_doc_len > 0.0)
? 0.75 * wordCount / avg_doc_len + 0.25
: 0.25;
// Query tokens produced by split() above; a set lookup replaces the
// original O(M) inner scan per file word.
std::unordered_set<std::string> queryTokens(ret.begin(), ret.end());
for (const auto &entry : words)
{
if (queryTokens.count(entry.first))
{
/*
tf = number of times word appears in a document, the term frequency
doclen = number of total words (including duplicates) in a document, the document length
avg_doc_len = average of all document lengths in the collection
doclen_correction = 0.75*doclen/avg_doclen + 0.25
tfnorm= tf/(tf + doclen_correction)
*/
// Cast before dividing: int/int truncated tf to 0 in the original.
tf = static_cast<double>(entry.second) / wordCount;
double tfnorm = tf / (tf + doclen_correction);
wordQueries.insert(std::pair<string, double> (entry.first, tfnorm));
}
}
if (wordQueries.empty())
{
// None of the query words occur in the file; the original code
// dereferenced max_element(end()) here, which is undefined behavior.
cout << "Sorry, I could not find anything about that in " << fileName << endl;
return 0;
}
auto best = std::max_element(wordQueries.begin(), wordQueries.end(),
[](const auto &a, const auto &b)
{
return a.second < b.second;
});
string token = best->first;
cout << "Let's talk about \"" << token << "\"... " << endl;
cout << Keyword(token, fileName, tf) << endl;
return 0;
}