"""Flask app that serves a random arXiv paper for a chosen archive.

A month is sampled with probability proportional to its (estimated) number
of submissions, then a random paper is picked from that month's listing and
its metadata is fetched from the arXiv export API.
"""
from bs4 import BeautifulSoup as bs
from datetime import datetime
import urllib.request
import re
from dateutil.rrule import rrule, MONTHLY
import numpy as np
from numpy.polynomial import Polynomial
from random import randint
from flask import Flask, abort, render_template
from functools import cache, partial
import feedparser
#import multiprocessing
import time
import queue
from concurrent.futures import ThreadPoolExecutor

#mp = multiprocessing.get_context('spawn')

# Upper bound on how many months `random` samples before giving up; replaces
# the unbounded self-recursion the view used when a sampled month was empty.
_MAX_SAMPLE_ATTEMPTS = 10


def parse(l):
    """Fetch URL *l* and return its content parsed with BeautifulSoup.

    The HTTP response is closed as soon as the document has been parsed.
    """
    with urllib.request.urlopen(l) as response:
        return bs(response, 'html.parser')


@cache
def total(l):
    """Return the entry count announced on the listing page at URL *l*.

    The count is read from the first ``<small>`` element, which is expected
    to contain the text ``total of <N>``.  Returns 0 when that element or the
    number is missing.  Results are memoised per URL for the lifetime of the
    process.
    """
    small = parse(l).find('small')
    if small is None:
        return 0
    # '+' (not '*') so that "total of" followed by no digits yields 0 rather
    # than int('') raising ValueError.
    r = re.search('total of ([0-9]+)', small.text)
    if r is None:
        return 0
    return int(r.group(1))


@cache
def get_links(category="cs"):
    """Return the monthly listing URLs for *category*, oldest first.

    The first month is read from the page at ``/list/<category>/0000``: the
    text of the first anchor whose href matches ``/list/<category>/YYMM`` is
    parsed as ``%y%m``.  (Presumably that page links to the archive's
    earliest month -- confirm against the live site.)  One URL is produced
    per month from that start until now.

    NOTE(review): the result is cached for the life of the process, so months
    that begin after the first call are not included until restart.
    """
    link = ("https://export.arxiv.org/list/" + category + "/{}{}").format
    first = parse(link('00', '00'))
    start = datetime.strptime(
        first.find('a',
                   href=re.compile('/list/' + category + '/[0-9]{4}')).text,
        '%y%m')
    return [
        link(t.strftime('%y'), t.strftime('%m'))
        for t in rrule(MONTHLY, dtstart=start, until=datetime.now())
    ]


@cache
def get_probability_distribution(topic):
    """Return per-month sampling probabilities for *topic*.

    Rather than fetching the count of every month, the counts of four months
    (first, middle, roughly two-thirds, second to last) are fetched in
    parallel and a cubic polynomial is fitted to their logarithms.  The
    polynomial is evaluated for every month index and exponentiated to get
    estimated counts; the last month's estimate is replaced by its real
    count.  The estimates are normalised to sum to 1.

    Returns:
        numpy array aligned with ``get_links(topic)``.
    """
    links = get_links(topic)
    x = [0, len(links) // 2, round(len(links) // 1.5), len(links) - 2]
    with ThreadPoolExecutor(max_workers=len(x) + 1) as exc:
        y = exc.map(total, [links[i] for i in x])
        last = exc.submit(total, links[-1])
    ly = np.log(list(y))
    p = Polynomial.fit(x, ly, deg=3)
    lengths = np.exp(p(np.arange(len(links))))
    lengths[-1] = last.result()  # account for current month having few submissions
    return lengths / lengths.sum()


taxonomy = [
    'cs', 'math', 'physics', 'econ', 'eess', 'astro-ph', 'cond-mat', 'gr-qc',
    'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nlin', 'nucl-ex',
    'nucl-th', 'quant-ph', 'q-bio', 'q-fin', 'stat'
]

app = Flask(__name__)


@app.route('/favicon.ico')
def favicon():
    """Answer favicon requests with 404 so they are not treated as a topic."""
    return abort(404)


# Topics for which entries are fetched in the background and queued.
preload_topics = ['cs', 'math', 'physics', 'quant-ph', 'econ']
# One bounded queue (capacity 10) of ready-to-render feed entries per topic.
preloaded_queues = {k: queue.Queue(10) for k in preload_topics}


def _pick_random_entry(topic):
    """Sample one random paper of *topic*.

    Returns:
        ``(entry, link)`` where *entry* is the feedparser entry from the
        export API and *link* is the ``https://arxiv.org/abs/...`` URL, or
        ``None`` when the sampled month reports zero entries, the listing
        page has no "Abstract" anchor, or the API feed is empty.
    """
    links = get_links(topic)
    weights = get_probability_distribution(topic)
    month_url = str(np.random.choice(links, p=weights))
    count = total(month_url)
    if count == 0:
        return None
    anchor = parse(month_url + f'?skip={randint(0, count - 1)}&show=5').find(
        'a', title="Abstract")
    if anchor is None:
        return None
    # Drop the first 6 characters of the anchor text (presumably the
    # "arXiv:" prefix -- confirm) to obtain the bare paper id.
    paper = anchor.text[6:]
    feed = feedparser.parse(
        f'http://export.arxiv.org/api/query?id_list={paper}')
    entries = feed['entries']
    if not entries:
        return None
    return entries[0], 'https://arxiv.org/abs/' + paper


def _render_entry(entry, link):
    """Render ``topic.html`` for feed *entry*, using *link* as the paper URL."""
    return render_template('topic.html',
                           description=entry['description'],
                           title=entry['title'],
                           link=link,
                           authors=entry['authors'],
                           published=time.strftime('%d %b %Y',
                                                   entry['published_parsed']))


def preload_random(topic):
    """Fetch one random entry for *topic* and add it to its preload queue.

    Does nothing when no entry could be sampled.  If the queue is already
    full the entry is dropped: a blocking ``put`` would park this worker
    thread until a later request drains the queue.
    """
    picked = _pick_random_entry(topic)
    if picked is None:
        return
    entry, _ = picked
    try:
        preloaded_queues[topic].put_nowait(entry)
    except queue.Full:
        pass  # best-effort preload; the surplus entry is simply discarded


@app.route('/<topic>', methods=['GET'])
def random(topic):
    """Render a random paper for *topic*.

    For preloaded topics, two background fetches are started whenever the
    queue is not full, and a queued entry is served if one is available.
    Otherwise (or for other topics) a paper is sampled synchronously, trying
    up to ``_MAX_SAMPLE_ATTEMPTS`` times before answering 503.

    Aborts with 500 when *topic* is not in ``taxonomy``.
    """
    if topic not in taxonomy:
        # NOTE(review): 404 would describe an unknown topic better; 500 kept
        # to preserve the existing response.
        return abort(500)

    if topic in preload_topics:
        pending = preloaded_queues[topic]
        if not pending.full():
            exc = ThreadPoolExecutor(max_workers=2)
            for _ in range(2):
                exc.submit(preload_random, topic)
            exc.shutdown(wait=False)
        try:
            # Non-blocking: another request may have taken the last entry
            # between any emptiness check and the get.
            entry = pending.get_nowait()
        except queue.Empty:
            pass
        else:
            return _render_entry(entry, entry['link'])

    for _ in range(_MAX_SAMPLE_ATTEMPTS):
        picked = _pick_random_entry(topic)
        if picked is not None:
            entry, link = picked
            return _render_entry(entry, link)
    return abort(503)


@app.route('/')
def index():
    """Render the landing page listing every archive in ``taxonomy``."""
    return render_template('index.html', taxonomy=taxonomy)


#@app.before_first_request
#def before_first_request():
#    # preload common topics
#    #with mp.Pool(processes=len(preload_topics)) as pool: #A
#    #    pool.map(get_probability_distribution, preload_topics)
#    with ThreadPoolExecutor(max_workers=len(preload_topics)) as exc:
#        for t in preload_topics:
#            exc.submit(preload_random, t)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)