Warn about .txt submissions and lex them by content

compare50/__main__.py: print a warning for every .txt file in the
submissions, archives and distro, saying whether it looks like plaintext
or like code (and which lexer was picked).

compare50/_data.py: pick the lexer of a .txt file from its content instead
of its extension; the guess-with-plaintext-fallback logic is shared with
the existing unknown-extension path.

The `index` lines are left out because the blob ids of the resulting files
are not known.

diff --git a/compare50/__main__.py b/compare50/__main__.py
--- a/compare50/__main__.py
+++ b/compare50/__main__.py
@@ -19,6 +19,7 @@
 import attr
 import lib50
 import termcolor
+import pygments.lexers.special
 
 from . import comparators, _api, _data, _renderer, __version__
 
@@ -245,6 +246,24 @@ def get_non_empty_subs(subs):
     def get_undecodable_files(subs):
         return [sub.path / file for sub in subs for file in sub.undecodable_files]
 
+    def warn_txt_files(subs):
+        """Print a warning for every .txt file in subs, saying whether it looks like plaintext or code"""
+        for sub in subs:
+            for file in sub.files:
+                if file.name.suffix != ".txt":
+                    continue
+
+                # Ask the file for its lexer instead of guessing a second time here,
+                # so the warning always names the lexer that is used for this file
+                lexer = file.lexer()
+
+                if isinstance(lexer, pygments.lexers.special.TextLexer):
+                    message = f"{file.name.name} appears to be plaintext. Scoring using `structure` could produce unexpected results."
+                else:
+                    message = f"{file.name.name} is a .txt file that appears to contain code. Interpreting as: {lexer.name}"
+
+                termcolor.cprint(message, "yellow", attrs=["bold"])
+
     # Print the number of subs, archives, distro files, and the average number of files per sub
     n_subs = len(get_non_empty_subs(subs))
     n_archives = len(get_non_empty_subs(archives))
@@ -285,6 +304,11 @@ def get_undecodable_files(subs):
         print_warning(undecodable, undecodable_archive, undecodable_distro, "non utf-8")
         did_print_warning = True
 
+    # Warn about txt files
+    warn_txt_files(subs)
+    warn_txt_files(archives)
+    warn_txt_files(distro_subs)
+
     # Print suggestion to run with --verbose if any files are excluded
     if not verbose and did_print_warning:
         termcolor.cprint("Rerun with --verbose to see which files are excluded",
diff --git a/compare50/_data.py b/compare50/_data.py
--- a/compare50/_data.py
+++ b/compare50/_data.py
@@ -183,7 +183,19 @@ def tokens(self):
 
     def lexer(self):
         """Determine which Pygments lexer should be used."""
+        def read_and_guess_lexer():
+            """Guess a lexer from the content of the file, falling back to plaintext."""
+            try:
+                return pygments.lexers.guess_lexer(self.read())
+            except pygments.util.ClassNotFound:
+                return pygments.lexers.special.TextLexer()
+
         ext = self.name.suffix
+
+        # A .txt file may well contain code, so guess from its content and skip the per-extension cache
+        if ext == '.txt':
+            return read_and_guess_lexer()
+
         try:
             return self._lexer_cache[ext]
         except KeyError:
@@ -195,10 +207,7 @@ def lexer(self):
             self._lexer_cache[ext] = lexer
             return lexer
         except pygments.util.ClassNotFound:
-            try:
-                return pygments.lexers.guess_lexer(self.read())
-            except pygments.util.ClassNotFound:
-                return pygments.lexers.special.TextLexer()
+            return read_and_guess_lexer()
 
     @classmethod
     def get(cls, id):