app.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

from bs4 import BeautifulSoup as bs
from datetime import datetime
import urllib.request
import re
from dateutil.rrule import rrule, MONTHLY
import numpy as np
from numpy.polynomial import Polynomial
from random import randint
from flask import Flask, abort, render_template
from functools import cache,partial
import feedparser
#import multiprocessing
import time
import queue
from concurrent.futures import ThreadPoolExecutor

#mp = multiprocessing.get_context('spawn')

def parse(l):
    """Download the document at URL ``l`` and parse it as HTML.

    Args:
        l: URL (or ``urllib.request.Request``) to fetch.

    Returns:
        A BeautifulSoup tree built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: if the request fails.
    """
    # BeautifulSoup reads the whole response while constructing the tree, so
    # the connection can be released as soon as the constructor returns.
    # The previous lambda never closed the response explicitly.
    with urllib.request.urlopen(l) as response:
        return bs(response, 'html.parser')


@cache
def total(l):
    """Return the entry count advertised on the listing page at URL ``l``.

    The count is read from the first ``<small>`` element of the page, from
    text of the form ``total of <digits>``. Results are memoised per URL for
    the lifetime of the process.

    Args:
        l: URL of the listing page.

    Returns:
        The advertised count as an ``int``, or 0 when the page has no
        ``<small>`` element or that element carries no count.
    """
    summary = parse(l).find('small')
    if summary is None:
        # No summary element at all: treat it like a page without a count
        # instead of failing with AttributeError on ``None.text``.
        return 0
    # ``+`` rather than ``*``: an empty digit run used to match and then
    # blow up in ``int('')``.
    r = re.search(r'total of ([0-9]+)', summary.text)
    if r is None:
        return 0
    return int(r.group(1))


@cache
def get_links(category="cs"):
    """Build the list of monthly listing URLs for ``category``.

    The ``0000`` listing page of the category is fetched and the first anchor
    whose ``href`` matches ``/list/<category>/NNNN`` is taken; its text is
    parsed as ``%y%m`` and used as the first month. One URL is then produced
    for every month from that one up to the current date.

    Args:
        category: archive identifier used in the listing URL.

    Returns:
        A list of URL strings in chronological order.
    """
    # NOTE(review): the result is cached for the life of the process, so a
    # long-running server will not pick up months that start after the
    # first call.
    template = "https://export.arxiv.org/list/" + category + "/{}{}"

    landing_page = parse(template.format('00', '00'))
    month_href = re.compile('/list/' + category + '/[0-9]{4}')
    earliest_label = landing_page.find('a', href=month_href).text
    first_month = datetime.strptime(earliest_label, '%y%m')

    monthly_urls = []
    for month in rrule(MONTHLY, dtstart=first_month, until=datetime.now()):
        year_part = month.strftime('%y')
        month_part = month.strftime('%m')
        monthly_urls.append(template.format(year_part, month_part))
    return monthly_urls


@cache
def get_probability_distribution(topic):
    """Estimate how the papers of ``topic`` are spread over its months.

    Fetching the total for every month would take one request per month, so
    only four sample months plus the most recent one are fetched. A cubic
    polynomial is fitted to the logarithm of the four sampled totals and
    evaluated for every month to estimate the remaining totals.

    Args:
        topic: archive identifier, passed on to ``get_links``.

    Returns:
        A 1-D numpy array with one probability per entry of
        ``get_links(topic)``. The result is memoised per topic.
    """
    links = get_links(topic)
    n_months = len(links)
    x = [0, n_months // 2, round(n_months // 1.5), n_months - 2]
    with ThreadPoolExecutor(max_workers=len(x) + 1) as exc:
        y = exc.map(total, [links[i] for i in x])
        last = exc.submit(total, links[-1])

    # ``total`` returns 0 when a page carries no count. log(0) is -inf, which
    # would poison the fit and, through @cache, every later request for this
    # topic. Clamp to 1 so the logarithm stays finite.
    ly = np.log(np.maximum(list(y), 1))

    p = Polynomial.fit(x, ly, deg=3)
    # A Polynomial evaluates element-wise on arrays; no np.vectorize needed.
    lengths = np.exp(p(np.arange(n_months)))
    lengths[-1] = last.result()  # account for current month having few submissions
    return lengths / lengths.sum()


# Archive identifiers accepted by the topic route and passed to the index
# template. Order is preserved as-is.
taxonomy = [
    'cs',
    'math',
    'physics',
    'econ',
    'eess',
    'astro-ph',
    'cond-mat',
    'gr-qc',
    'hep-ex',
    'hep-lat',
    'hep-ph',
    'hep-th',
    'math-ph',
    'nlin',
    'nucl-ex',
    'nucl-th',
    'quant-ph',
    'q-bio',
    'q-fin',
    'stat',
]

# The Flask application object; the route decorators below register on it.
app = Flask(import_name=__name__)


@app.route('/favicon.ico')
def favicon():
    """Reject favicon requests with HTTP 404.

    ``abort`` raises, so this view never returns a response body itself.
    """
    abort(404)

# Topics for which papers are fetched ahead of time.
preload_topics = ['cs', 'math', 'physics', 'quant-ph', 'econ']
# One bounded queue (at most 10 entries) of prefetched papers per topic.
preloaded_queues = dict(
    (name, queue.Queue(maxsize=10)) for name in preload_topics
)
def preload_random(topic):
    """Fetch one random paper of ``topic`` and stash it in its preload queue.

    A month is drawn according to ``get_probability_distribution``, a random
    offset inside that month's listing is requested, and the metadata of the
    paper found there is fetched from the arXiv API. The resulting feed entry
    is appended to ``preloaded_queues[topic]``.

    The function is best-effort: it returns without queuing anything when the
    topic has no preload queue, the chosen month reports no papers, the
    listing or API response lacks the expected data, or the queue is already
    full.

    Args:
        topic: archive identifier.

    Returns:
        None.
    """
    target = preloaded_queues.get(topic)
    if target is None:
        # Not a preloaded topic: nothing to fill, so skip the network work.
        return

    p = get_probability_distribution(topic)
    l = get_links(topic)
    d = np.random.choice(l, p=p)
    t = total(d)
    if t == 0:
        return

    anchor = parse(d + f'?skip={randint(0, t - 1)}&show=5').find('a', title="Abstract")
    if anchor is None:
        return
    # Drop the first six characters of the anchor text
    # (presumably the "arXiv:" prefix -- TODO confirm).
    paper = anchor.text[6:]

    feed = feedparser.parse(f'http://export.arxiv.org/api/query?id_list={paper}')
    entries = feed['entries']
    if not entries:
        return

    try:
        # A blocking put() would park this worker thread for as long as the
        # queue stays full; dropping the surplus entry is harmless.
        target.put_nowait(entries[0])
    except queue.Full:
        pass

def _render_paper(entry, link):
    """Render ``topic.html`` for one arXiv API feed entry."""
    return render_template('topic.html',
                           description=entry['description'],
                           title=entry['title'],
                           link=link,
                           authors=entry['authors'],
                           published=time.strftime('%d %b %Y',
                                                   entry['published_parsed']))


@app.route('/<topic>', methods=['GET'])
def random(topic):
    """Serve a page showing one random paper from ``topic``.

    For topics that have a preload queue, background jobs are started to top
    the queue up, and a queued paper is served when one is available.
    Otherwise a paper is picked synchronously: a month is drawn from
    ``get_probability_distribution``, a random offset in that month's listing
    is requested, and the paper's metadata is fetched from the arXiv API.

    Args:
        topic: archive identifier taken from the URL path.

    Returns:
        The rendered ``topic.html`` page. Aborts with HTTP 500 when ``topic``
        is not in ``taxonomy`` or when no paper could be obtained after
        several attempts.
    """
    if topic not in taxonomy:
        # NOTE(review): 404 would describe an unknown topic better; 500 is
        # kept so existing clients see the same status code.
        return abort(500)

    # Only the topics in ``preload_topics`` have a queue. Indexing the dict
    # directly raised KeyError for every other valid topic.
    pending = preloaded_queues.get(topic)
    if pending is not None:
        if not pending.full():
            exc = ThreadPoolExecutor(max_workers=2)
            for _ in range(2):
                exc.submit(preload_random, topic)
            exc.shutdown(wait=False)

        try:
            # Non-blocking: another request may drain the queue between an
            # empty() check and a get(), which would stall this request.
            entry = pending.get_nowait()
        except queue.Empty:
            pass
        else:
            return _render_paper(entry, entry['link'])

    l = get_links(topic)
    p = get_probability_distribution(topic)

    # Bounded retry instead of unbounded recursion: re-entering the view on
    # an empty month also re-submitted preload jobs on every level.
    max_attempts = 10
    for _ in range(max_attempts):
        d = np.random.choice(l, p=p)
        t = total(d)
        if t == 0:
            continue

        anchor = parse(d + f'?skip={randint(0, t - 1)}&show=5').find(
            'a', title="Abstract")
        if anchor is None:
            continue
        # Drop the first six characters of the anchor text
        # (presumably the "arXiv:" prefix -- TODO confirm).
        paper = anchor.text[6:]

        feed = feedparser.parse(
            f'http://export.arxiv.org/api/query?id_list={paper}')
        if feed['entries']:
            return _render_paper(feed['entries'][0],
                                 'https://arxiv.org/abs/' + paper)

    return abort(500)


@app.route('/')
def index():
    """Render the landing page, handing the topic list to the template."""
    page = render_template('index.html', taxonomy=taxonomy)
    return page

#@app.before_first_request
#def before_first_request():
#    # preload common topics
#    #with mp.Pool(processes=len(preload_topics)) as pool:
#    #    pool.map(get_probability_distribution, preload_topics)
#    with ThreadPoolExecutor(max_workers=len(preload_topics)) as exc:
#        for t in preload_topics:
#            exc.submit(preload_random, t)

if __name__ == '__main__':
    # Direct execution: start Flask's built-in server on every interface,
    # port 8080.
    app.run(port=8080, host='0.0.0.0')