"""Flask app that redirects the visitor to a randomly chosen arXiv paper.

For a given arXiv archive (e.g. ``cs`` or ``math``) the app scrapes the
monthly listing pages on export.arxiv.org, estimates how many papers were
submitted in each month, samples a month proportionally to that estimate,
and then picks a random paper from the sampled month.
"""
import multiprocessing.pool
import re
import urllib.request
from datetime import datetime
from functools import cache
from random import randint

import feedparser
import numpy as np
from bs4 import BeautifulSoup as bs
from dateutil.rrule import rrule, MONTHLY
from flask import Flask, abort, render_template
from numpy.polynomial import Polynomial

# Seconds to wait for export.arxiv.org before giving up on a request.
REQUEST_TIMEOUT = 30

# Upper bound on how many months are sampled when looking for a non-empty one.
MAX_ATTEMPTS = 20

# Accepted shape of a topic: an archive name with an optional subcategory,
# e.g. "cs", "cs.AI", "cond-mat.mes-hall".
_TOPIC_RE = re.compile(r'[a-z-]+(\.[A-Za-z-]+)?')

# NOTE: the @cache decorators below never expire, so the month list, the
# per-month totals and the probability distribution stay as first computed
# until the process is restarted.


def parse(l):
    """Fetch the URL *l* and return its body parsed as a BeautifulSoup tree."""
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(l, timeout=REQUEST_TIMEOUT) as response:
        return bs(response.read(), 'html.parser')


@cache
def total(l):
    """Return the number of entries reported on the listing page at URL *l*.

    The count is read from the "total of N" text inside the page's first
    ``<small>`` element. Returns 0 when that element or text is missing.
    """
    summary = parse(l).find('small')
    if summary is None:
        return 0
    match = re.search(r'total of ([0-9]+)', summary.text)
    return int(match.group(1)) if match else 0


@cache
def get_links(category="cs"):
    """Return the monthly listing URLs for *category*, oldest first.

    The first available month is discovered from the archive's ``0000``
    listing page; one URL is then generated per month up to the current date.

    Raises:
        LookupError: if the archive page contains no monthly listing link.
    """
    def link(yy, mm):
        return f"https://export.arxiv.org/list/{category}/{yy}{mm}"

    first = parse(link('00', '00'))
    # re.escape keeps characters such as "." in the category literal.
    month_href = re.compile('/list/' + re.escape(category) + '/[0-9]{4}')
    anchor = first.find('a', href=month_href)
    if anchor is None:
        raise LookupError(f'no monthly listings found for {category!r}')

    start = datetime.strptime(anchor.text.strip(), '%y%m')
    months = rrule(MONTHLY, dtstart=start, until=datetime.now())
    return [link(t.strftime('%y'), t.strftime('%m')) for t in months]


@cache
def get_probability_distribution(topic):
    """Return an array of per-month sampling probabilities for *topic*.

    Fetching the total for every month would be slow, so only four months
    are sampled and a cubic polynomial is fitted to the logarithm of their
    totals. The fitted curve supplies the estimated size of every month;
    the final (current) month uses its real total instead.
    """
    links = get_links(topic)
    n = len(links)
    sample_idx = [0, n // 2, round(n // 1.5), n - 2]
    urls = [links[i] for i in sample_idx] + [links[-1]]

    # Threads rather than processes: the work is network-bound, and results
    # land in this process's `total` cache so they are not fetched twice.
    with multiprocessing.pool.ThreadPool(processes=5) as pool:
        *sampled, current = pool.map(total, urls)

    # Clamp to 1 so an empty sampled month cannot produce log(0) = -inf.
    log_counts = np.log(np.maximum(sampled, 1))
    fit = Polynomial.fit(sample_idx, log_counts, deg=3)
    lengths = np.exp(fit(np.arange(n)))
    lengths[-1] = current  # account for current month having few submissions
    return lengths / lengths.sum()


taxonomy = ['cs', 'econ', 'eess', 'math', 'astro-ph', 'cond-mat', 'gr-qc',
            'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nlin',
            'nucl-ex', 'nucl-th', 'physics', 'quant-ph', 'q-bio', 'q-fin',
            'stat']

app = Flask(__name__)


@app.route('/favicon.ico')
def favicon():
    """Answer favicon requests with 404 so they are not treated as a topic."""
    return abort(404)


@app.route('/<topic>', methods=['GET'])
def random(topic):
    """Render a randomly selected paper from the archive named by *topic*.

    Responds with 404 for a topic outside the known taxonomy and with 502
    when arXiv does not return a usable listing or feed entry.
    """
    # The topic ends up in outbound URLs and as a cache key, so accept only
    # known archives (optionally followed by a subcategory).
    archive = topic.partition('.')[0]
    if archive not in taxonomy or not _TOPIC_RE.fullmatch(topic):
        abort(404)

    try:
        links = get_links(topic)
    except LookupError:
        abort(404)
    weights = get_probability_distribution(topic)

    # Re-sample when the chosen month is empty, but never loop forever.
    for _ in range(MAX_ATTEMPTS):
        month_url = links[np.random.choice(len(links), p=weights)]
        count = total(month_url)
        if count:
            break
    else:
        abort(502)

    listing = parse(f'{month_url}?skip={randint(0, count - 1)}&show=5')
    anchor = listing.find('a', title="Abstract")
    if anchor is None:
        abort(502)
    # The anchor text looks like "arXiv:<id>"; keep only the identifier.
    paper = anchor.text.strip().removeprefix('arXiv:')
    link = 'https://arxiv.org/abs/' + paper

    feed = feedparser.parse(
        f'http://export.arxiv.org/api/query?id_list={paper}')
    entries = feed['entries']
    if not entries:
        abort(502)
    entry = entries[0]
    return render_template('topic.html',
                           description=entry['description'],
                           title=entry['title'],
                           link=link)


@app.route('/')
def index():
    """Render the landing page listing every archive in the taxonomy."""
    return render_template('index.html', taxonomy=taxonomy)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)