diff options
Diffstat (limited to 'app.py')
-rw-r--r-- | app.py | 35 |
1 files changed, 20 insertions, 15 deletions
diff --git a/app.py b/app.py
index 7793ce6..1ef8d64 100644
--- a/app.py
+++ b/app.py
@@ -16,28 +16,31 @@ from concurrent.futures import ThreadPoolExecutor
 
 #mp = multiprocessing.get_context('spawn')
 
-parse = lambda l: bs(urllib.request.urlopen(l), 'html.parser')
+def parse(l):
+    u = urllib.request.urlopen(l)
+    b = bs(u, 'html.parser')
+    return b
 
 
 @cache
 def total(l):
-    r = re.search('total of ([0-9]*)', parse(l).find('small').text)
-    if r is None:
-        return 0
-    return int(r.group(1))
+    p = parse(l)
+    r = p.text.split('Total of')[1].split()[0]
+    return int(r) or 0
 
 
 @cache
 def get_links(category="cs"):
-    link = ("https://export.arxiv.org/list/" + category + "/{}{}").format
-
-    first = parse(link('00', '00'))
-    start = datetime.strptime(
-        first.find('a',
-                   href=re.compile('/list/' + category + '/[0-9]{4}')).text,
-        '%y%m')
+    link = ("https://export.arxiv.org/list/" + category + "/{}-{}").format
+
+    #first = parse(link('19', '92'))
+    #start = datetime.strptime(
+    #    first.find('a',
+    #               href=re.compile('/list/' + category + '/[0-9]{4}')).text,
+    #    '%y%m')
+    start = datetime.strptime('1992', '%Y')
     return [
-        link(t.strftime('%y'), t.strftime('%m'))
+        link(t.strftime("%Y"), t.strftime('%m'))
         for t in rrule(MONTHLY, dtstart=start, until=datetime.now())
     ]
 
@@ -77,7 +80,8 @@ def preload_random(topic):
     d = np.random.choice(l, p=p)
     t = total(d)
     if t != 0:
-        paper = parse(d + f'?skip={randint(0,t-1)}&show=5').find('a', title="Abstract").text[6:]
+        l = d + f'?skip={randint(0,t-1)}&show=25'
+        paper = parse(l).find('a', title="Abstract").text[6:]
         link = 'https://arxiv.org/abs/' + paper
         feed = feedparser.parse(f'http://export.arxiv.org/api/query?id_list={paper}')
         entry = feed['entries'][0]
@@ -112,8 +116,9 @@ def random(topic):
     t = total(d)
     if t == 0:
         return random(topic)
-    paper = parse(d + f'?skip={randint(0,t-1)}&show=5').find(
+    paper = parse(d + f'?skip={randint(0,t-1)}&show=25').find(
         'a', title="Abstract").text[6:]
+    paper = paper.split(':')[1]
     link = 'https://arxiv.org/abs/' + paper
     feed = feedparser.parse(
         f'http://export.arxiv.org/api/query?id_list={paper}')