Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions compare50/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import attr
import lib50
import termcolor
import pygments

from . import comparators, _api, _data, _renderer, __version__

Expand Down Expand Up @@ -245,6 +246,26 @@ def get_non_empty_subs(subs):
def get_undecodable_files(subs):
    """Return the full path of every undecodable file across all of ``subs``.

    Each path is the submission's ``path`` joined with one entry of its
    ``undecodable_files``; order follows ``subs``, then each submission's
    own file order.
    """
    undecodable_paths = []
    for sub in subs:
        for file in sub.undecodable_files:
            undecodable_paths.append(sub.path / file)
    return undecodable_paths

def warn_txt_files(subs):
    """Warn about every ``.txt`` file found in the files of ``subs``.

    A ``.txt`` extension does not identify a language, so the lexer for such
    a file is guessed from its contents. One bold yellow warning is printed
    per ``.txt`` file: either that the file appears to be plaintext, or which
    lexer it is being interpreted as.

    :param subs: iterable of submissions; each exposes ``files``, whose items
        expose ``name`` (a path-like with ``suffix`` and ``name``) and
        ``lexer()``.
    :returns: ``None``; the warnings are printed with ``termcolor.cprint``.
    """
    for sub in subs:
        for file in sub.files:
            if file.name.suffix != ".txt":
                continue

            # Ask the file for its lexer rather than guessing again here: the
            # file's own lexer() already guesses from the contents of .txt
            # files and falls back to the plaintext lexer, so the warning
            # always names the lexer that is actually used and the guessing
            # logic is not duplicated.
            lexer = file.lexer()

            if isinstance(lexer, pygments.lexers.special.TextLexer):
                # The file is interpreted as a plaintext file
                message = (
                    f"{file.name.name} appears to be plaintext. "
                    "Scoring using `structure` could produce unexpected results."
                )
            else:
                message = (
                    f"{file.name.name} is a .txt file that appears to contain code. "
                    f"Interpreting as: {lexer.name}"
                )

            termcolor.cprint(message, "yellow", attrs=["bold"])

# Print the number of subs, archives, distro files, and the average number of files per sub
n_subs = len(get_non_empty_subs(subs))
n_archives = len(get_non_empty_subs(archives))
Expand Down Expand Up @@ -285,6 +306,11 @@ def get_undecodable_files(subs):
print_warning(undecodable, undecodable_archive, undecodable_distro, "non utf-8")
did_print_warning = True

# Warn about txt files
warn_txt_files(subs)
warn_txt_files(archives)
warn_txt_files(distro_subs)

# Print suggestion to run with --verbose if any files are excluded
if not verbose and did_print_warning:
termcolor.cprint("Rerun with --verbose to see which files are excluded",
Expand Down
19 changes: 14 additions & 5 deletions compare50/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,19 @@ def tokens(self):

def lexer(self):
"""Determine which Pygments lexer should be used."""
def read_and_guess_lexer():
    """Guess a lexer from the file's contents, defaulting to the plaintext lexer."""
    try:
        guessed = pygments.lexers.guess_lexer(self.read())
    except pygments.util.ClassNotFound:
        # Pygments could not match the contents to any lexer.
        guessed = pygments.lexers.special.TextLexer()
    return guessed

ext = self.name.suffix

# If this is a .txt file, assume it's some kind of code and infer its lexer
if ext == '.txt':
return read_and_guess_lexer()

try:
return self._lexer_cache[ext]
except KeyError:
Expand All @@ -195,10 +207,7 @@ def lexer(self):
self._lexer_cache[ext] = lexer
return lexer
except pygments.util.ClassNotFound:
try:
return pygments.lexers.guess_lexer(self.read())
except pygments.util.ClassNotFound:
return pygments.lexers.special.TextLexer()
return read_and_guess_lexer()

@classmethod
def get(cls, id):
Expand All @@ -207,7 +216,7 @@ def get(cls, id):

def unprocessed_tokens(self):
"""Get the raw tokens of the file."""
text = self.read()
text = self.read()
lexer_tokens = self.lexer().get_tokens_unprocessed(text)
tokens = []
prevToken = None
Expand Down