Skip to content

Commit ffa6a8e

Browse files
committed
Properly encode characters used in URL for arxiv and semanticscholar backends
1 parent 65f3d87 commit ffa6a8e

File tree

2 files changed: +12 lines added, -11 lines removed

litstudy/sources/arxiv.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Optional, List
33
import feedparser # type: ignore
44
from datetime import datetime
5+
from urllib.parse import urlencode
56
import time
67

78

@@ -60,6 +61,8 @@ def category(self) -> Optional[List[str]]:
6061
'''returns arxiv category for article'''
6162
return self.entry.get('tags', None)[0].get('term', None)
6263

64+
# Base api query url
65+
ARXIV_SEARCH_URL = 'http://export.arxiv.org/api/query'
6366

6467
def search_arxiv(search_query,
6568
start=0,
@@ -89,16 +92,14 @@ def search_arxiv(search_query,
8992

9093
docs = list()
9194

92-
# Base api query url
93-
base_url = 'http://export.arxiv.org/api/query?'
94-
95-
print(f'Searching arXiv for {search_query}')
96-
9795
for i in range(start, total_results, results_per_iteration):
98-
query = (f'search_query={search_query}&start={i}&max_results='
99-
f'{results_per_iteration}')
96+
query = urlencode(dict(
97+
search_query=search_query,
98+
start=i,
99+
max_results=results_per_iteration
100+
))
100101

101-
url = base_url + query
102+
url = f'{ARXIV_SEARCH_URL}?{query}'
102103
data = feedparser.parse(url)
103104

104105
for entry in data.entries:

litstudy/sources/semanticscholar.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from time import sleep
22
from typing import Tuple, Optional
3-
from urllib.parse import quote_plus
3+
from urllib.parse import urlencode
44
import logging
55
import requests
66
import shelve
@@ -115,11 +115,11 @@ def request_results(query, offset, cache, timeout=DEFAULT_TIMEOUT):
115115

116116

117117
def request_paper(key, cache, timeout=DEFAULT_TIMEOUT):
118-
cache_key = f'paper={key}'
118+
cache_key = urlencode(dict(paper=key))
119119
if cache_key in cache:
120120
return cache[cache_key]
121121

122-
url = S2_PAPER_URL + quote_plus(key)
122+
url = S2_PAPER_URL + cache_key
123123

124124
try:
125125
sleep(timeout)

0 commit comments

Comments (0)