Crawling

A way to explore Wikipedia and store the results in a database. A function lets you search the keywords, but ideally it would also search the end of the URL, which holds the title of the article (see this page: https://stackoverflow.com/questions...a-url-string-up-into-separate-parts-in-python); a rough sketch of that idea is posted after the search script below. This code was made by taking pieces of code written by other people; I just put them together.
Python:
import logging
import sqlite3
import base64
from urllib.parse import urljoin
from random import choice

import requests
import httpx
from bs4 import BeautifulSoup
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from colorama import init

init()
timeout = httpx.Timeout(5)


conn = sqlite3.connect('crawled.db')
c = conn.cursor()
# the crawler writes into crawl_copy; create it here if it does not exist yet
c.execute('CREATE TABLE IF NOT EXISTS crawl_copy '
          '(id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT, data TEXT, keyword TEXT)')
conn.commit()


def listToString(s):
    # join the list of keyword strings into one space-separated string
    return " ".join(s)



desktop_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
                  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0']


def random_headers():
    return {'User-Agent': choice(desktop_agents), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}





logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

def remove_text_inside_brackets(text, brackets="[]"):
    count = [0] * (len(brackets) // 2)  # count open/close brackets
    saved_chars = []
    for character in text:
        for i, b in enumerate(brackets):
            if character == b:  # found bracket
                kind, is_close = divmod(i, 2)
                count[kind] += (-1)**is_close  # `+1`: open, `-1`: close
                if count[kind] < 0:  # unbalanced bracket
                    count[kind] = 0  # keep it
                else:  # found bracket to remove
                    break
        else:  # character is not a [balanced] bracket
            if not any(count):  # outside brackets
                saved_chars.append(character)
    return ''.join(saved_chars)

class Crawler:

    def __init__(self, urls=None):
        # avoid a mutable default argument; start from the given seed URLs
        self.visited_urls = []
        self.urls_to_visit = list(urls) if urls else []

    def download_url(self, url):

        # route requests through the local Tor SOCKS proxy (Tor Browser listens on 9150)
        proxies = {'http': 'socks5://localhost:9150', 'https': 'socks5://localhost:9150'}

        res = requests.get(url, proxies=proxies, headers=random_headers())
        pagetext = res.text

        wiki = BeautifulSoup(pagetext, 'html.parser')
        # gather the text of every paragraph, dropping the [1]-style reference markers
        paragraphs = []
        for l in wiki.select('p'):
            machine_response = l.getText()
            paragraphs.append(remove_text_inside_brackets(machine_response))
        clear_response = ' '.join(paragraphs)

        r = Rake()

        r.extract_keywords_from_text(clear_response)

        ranked_phrases = r.get_ranked_phrases()[0:20]
        vectorizer = TfidfVectorizer(
            sublinear_tf=True, encoding='latin-1', stop_words='english')

        X = vectorizer.fit_transform(ranked_phrases)

        true_k = 1
        km = KMeans(n_clusters=true_k, init='k-means++',
                    max_iter=100, n_init=1, random_state=1)
        km.fit(X)

        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        # note: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() there
        terms = vectorizer.get_feature_names()
        listkwd = []
        for i in range(true_k):
            for ind in order_centroids[i, :10]:
                test = ' %s' % terms[ind]
                listkwd.append(test)

        newkeyword = listToString(listkwd)
        encodedBytes = base64.b64encode(url.encode("utf-8"))
        encodedStr = str(encodedBytes, "utf-8")
        encodedBot = base64.b64encode(clear_response.encode("utf-8"))
        encodedbot = str(encodedBot, "utf-8")
        c.execute("insert into crawl_copy (url, data, keyword) values (?, ?, ?)",
                  (encodedStr, encodedbot, newkeyword))
        conn.commit()
        return pagetext

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)


if __name__ == '__main__':
    Crawler(urls=['https://en.wikipedia.org/wiki/']).run()
 
The search and display function

Python:
import re
import sqlite3
import base64

#https://stackoverflow.com/questions/50063058/sqlite3-create-function-regexp-with-python
def regexp(expr, item):
    reg = re.compile(expr)
    return reg.search(item) is not None

conn = sqlite3.connect("crawled.db")
conn.create_function("REGEXP", 2, regexp)
c = conn.cursor()
#search_term = str(input())
search_term = str(input('Enter your search term here: '))
#c.execute('select  id, url, data, keyword from crawl_copy')
c.execute('SELECT id, url, data, keyword FROM crawl_copy WHERE keyword REGEXP ?',[search_term])
records = c.fetchall()

for record in records:
    ask = record[1]
    ask = base64.b64decode(ask)
    ask=str(ask,'utf-8')
    answer = record[2]
    answer = base64.b64decode(answer)
    answer=str(answer,'utf-8')
    print("id : ", record[0])
    print("url : ", ask)
    print("text : ", answer)
    print("keyword : ", record[3])
 
