# Source code for snowballing.config

"""This module configures the snowballing.

Please, use the database __init__ to replace these configurations.
"""
import textwrap
from copy import copy
from pathlib import Path

from string import ascii_lowercase
from bibtexparser.customization import homogenize_latex_encoding
from IPython.display import HTML
from selenium import webdriver

from .collection_helpers import callable_get, define_cvar
from .collection_helpers import consume, setitem, remove_empty
from .config_helpers import reorder_place, last_name_first_author
from .config_helpers import var_item, str_list, str_item, sequence
from .config_helpers import work_by_varname, find_work_by_info, Site
from .config_helpers import generate_title
from .rules import ModifyRules
from .utils import compare_str, match_any, to_list

# Tool version
JOHN_SNOW_VERSION = "1.0.0"

# Database path
DATABASE_DIR = Path.home() / "database"

# Text editor path
TEXT_EDITOR = None
# Text editor argument for opening a file at a given line.
# Use a format string with arguments {year_path} and {line}
LINE_PARAMS = None

# Web Driver
# Zero-argument factory, so the Chrome driver is only created when it is called
WEB_DRIVER = lambda: webdriver.Chrome()

# Run widget
# Use True to indicate that widgets that generate executable code should have a text area with a button for running the code.
# Otherwise, they modify notebook cells (unsafe)
RUN_WIDGET = True 

# PDF Extractor. Use the string argument {path} to define the path
# NOTE(review): the None assignment below is immediately overridden by the
# command string that follows it.
PDF_EXTRACTOR = None
PDF_EXTRACTOR = "java -jar refExtractor.jar full {path}"

# List of possible work class tuples
# Each tuple has the following elements:
#   Class Name
#   Category name
#   Graph visibility (Options: display, hide, always_hide)
#   Graph node color
#   Graph node text color
CLASSES = [
    ("Work", "work", "display", "#FFD86E", "black"),
    ("WorkSnowball", "snowball", "display", "#6DCE9E", "white"),
    ("WorkOk", "ok", "display", "#68BDF6", "white"),
    ("WorkUnrelated", "unrelated", "hide", "#DE9BF9", "white"),
    ("WorkNoFile", "nofile", "hide", "#A5ABB6", "white"),
    ("WorkLang", "lang", "hide", "#ff8040", "white"),
    ("Site", "site", "hide", "#000080", "white"),
    ("Email", "site", "hide", "#000080", "white"),
]
# Default class for insertion
DEFAULT_CLASS = "Work"

# Similarity ratio for matching places
SIMILARITY_RATIO = 0.8

# Check Deprecation
CHECK_DEPRECATION = True


### Fields

# Debug fields during BibTeX export
# When True, _work_to_bibtex_middle prints a "[DEBUG WARNING]" line for every
# work attribute that matches neither the "<use>" nor the "<ignore>" patterns.
DEBUG_FIELDS = True

# List of exportable work fields to BibTeX
WORK_FIELDS = [
    "entrytype", "year", "name", "authors", "place",
    "booktitle", "bookauthors", "edition", "available",
    "volume", "number", "section", "pp", "article",
    "doi", "isbn",  "proceedings", "issn",
    "organization", "publisher", "school", "institution", "track",
    "ref", "local", "editors", "awards",
    "special", "website", "link", "scholar", "shorttitle", "address",
]

# Regexes that start with ^ and end with $
# (the anchors are presumably added by the matcher — TODO confirm)
# NOTE(review): "scholar" is listed both here and in WORK_FIELDS.
BIBTEX_IGNORE_FIELDS = [
    "excerpt", "month", "bookname", "url", "ID",

    # Tool
    "_.*", "force_.*", "file.*", "category", "alias", "aliases", "scholar_ok",
    "scholar", "cluster_id", "scholar_id", "display", "metakey", "due", "_tyear",
    "citation_file", "notes", "tracking", "snowball", "request", "_draw",
    "may_be_related_to", "note",
]


### Transformation Rules

def _place_value(place1):
    """Get place value based on similarity

    It considers a match if the matching ratio for the place name
    is >= config.SIMILARITY_RATIO (0.8).

    It also considers a match if the acronym matches, or if the name of the
    best candidate is contained in the (reordered) place text.

    Arguments:

    * `place1` -- free-text place name, normalized with `reorder_place`

    Returns the variable name of the best matching place, or None when
    nothing matches (including when no places are loaded).

    Doctest:

    .. doctest::

        >>> _place_value("IPAW")
        'IPAW'
        >>> _place_value("Software Engineering, International Conference on")
        'ICSE'
        >>> _place_value("A random conference") is None
        True
    """
    from .operations import load_places_vars
    place = reorder_place(place1)

    # Tuples compare element-wise: the score decides first and ties are
    # broken by the variable name. `default=None` avoids the ValueError that
    # max() raises on an empty sequence when no places are loaded.
    best_match = max(
        (
            (
                max(
                    compare_str(place, varvalue.name),
                    1 if place == varvalue.acronym else 0
                ),
                varname,
                varvalue,
            )
            for varname, varvalue in load_places_vars()
        ),
        default=None,
    )
    if best_match is None:
        return None

    score, best_varname, best_place = best_match
    if score >= SIMILARITY_RATIO or best_place.name in place:
        return best_varname
    return None
    
def _pos_diff(work, info, result):
    """Remove from diff if result in list or alias

    Post-processes the diff in place.

    Arguments:

    * `work` -- existing work; may define an `ignore` list of keys to drop
    * `info` -- info object being compared (not used by this implementation)
    * `result` -- diff with the mappings `result["set"]` and `result["show"]`;
      every value of `result["set"]` is unpacked as an `(after, before)` pair

    Returns None; `result` is modified in place.
    """
    # Keys that are still candidates for the final equality check
    keys = set(result["set"])

    def delkey(key, from_set=True):
        """Drop key from the diff and, optionally, from the pending keys"""
        if key in result["set"]:
            del result["set"][key]
        if key in result["show"]:
            del result["show"][key]
        if from_set:
            # discard, not remove: the key may be absent (an ignored key that
            # is not part of the diff) or may have been dropped already
            keys.discard(key)

    if hasattr(work, "ignore") and isinstance(work.ignore, list):
        for key in work.ignore:
            delkey(key)

    scholar_ok_key = SCHOLAR_MAP["scholar_ok"]
    if scholar_ok_key in keys:
        _, before = result["set"][scholar_ok_key]
        if before:
            delkey(scholar_ok_key)

    scholar_key = SCHOLAR_MAP["scholar"]
    if scholar_key in keys:
        after, before = result["set"][scholar_key]
        # Normalize the interface language of the url before comparing
        after = [x.replace("hl=pt-BR", "hl=en") for x in to_list(after or [])]
        before = [x.replace("hl=pt-BR", "hl=en") for x in to_list(before or [])]
        if len(after) == 1:
            after = after[0]
        if len(before) == 1:
            before = before[0]
        result["set"][scholar_key] = (after, before)

    aliases = get_work_aliases(work)
    if ATTR["name"] in keys:
        after, before = result["set"][ATTR["name"]]
        for alias in aliases:
            if alias[1] == after:
                delkey(ATTR["name"])
                # One matching alias is enough; stop as the authors loop does
                break
    if "authors" in keys:
        after, before = result["set"]["authors"]
        for alias in aliases:
            if len(alias) > 2 and alias[2] == after:
                delkey("authors")
                break

    for key in keys:
        after, before = result["set"][key]
        # Evaluated as: (before is a sequence containing after) or (after == before)
        if isinstance(before, (list, tuple)) and after in before or after == before:
            # from_set=False keeps `keys` unchanged while it is being iterated
            delkey(key, from_set=False)
        

# Map BibTex to Info object
#   Plain keys ("title", "author", ...) map a BibTeX field to Info key(s).
#   Keys written as "<...>" are special entries; they are presumably
#   interpreted by the rules engine, which is not defined in this module.
BIBTEX_TO_INFO = {
    "<before>": [
        ("place1", ""),
        # Convert year to int; "[in press]" years keep only the first 4 characters
        ("year", lambda x: int(
            (x["year"][:4] or 0) if "[in press]" in x.get("year", "")
            else x.get("year", 0)
        )),
    ],
    "<middle>": [
        ("place", lambda old, new: _place_value(new["place1"])),
        ("display", lambda old, new: last_name_first_author(new["authors"])),
        ("pyref", lambda old, new: info_to_pyref(new)),
    ],
    "<after>": [
        # Strip BibTeX braces from the title
        ("name", lambda old, new, current: new["name"].replace("{", "").replace("}", ""))
    ],
    "title": ["name"],
    "author": ["authors"],
    "address": ["local"],
    "publisher": ["organization"],
    "pages": ["pp"],
    "ENTRYTYPE": ["entrytype"],
    "journal": ["place1"],
    "booktitle": ["place1"],
    "year": [
        ("note", lambda x: "in press" if "[in press]" in x.get("year", "") else None)
    ],
    "<article>": [
        ("excerpt", lambda article, new: article.attrs["excerpt"][0]),
        ("cluster_id", lambda article, new: article.attrs["cluster_id"][0]),
        ("scholar", lambda article, new: article.attrs["url_citations"][0]),
        ("div", lambda article, new: article.div),
    ],
    "<set_ignore_keys>": {"excerpt", "div", "pyref", "display", "place", "_work_type", "_ref"},
    "<set_ignore_but_show>": {"place1", "year"},
    "<set_always_show>": {"name", "authors", "entrytype", "place1", "year"},
    "<set_order>": ["name", "authors", "entrytype", "place1", "year"],
    "<scholar_ok>": "scholar_ok",
    # The list is built only for the side effects of its conditional setattr calls
    # NOTE(review): `info` is indexed like a mapping but assigned with setattr —
    # confirm that the info object supports attribute assignment.
    "<set_before>": lambda work, info: [
        setattr(work, "entrytype", work.place.type)
            if not hasattr(work, "entrytype")
            and hasattr(work, "place") else 
            None,
        setattr(info, "place1", info["place"])
            if "place" in info
            and not "place1" in info else
            None,
        setattr(work, "place1", "{} ({})".format(work.place.name, work.place.acronym))
            if hasattr(work, "place") else
            None,
    ],
    "<pos_diff>": _pos_diff
}


# Map BibTex to Info object. Set _work_type=Work
#   Derived from BIBTEX_TO_INFO through ModifyRules, appending the
#   ("_work_type", "Work") entry to "<before>"
BIBTEX_TO_INFO_WITH_TYPE = (
    ModifyRules(BIBTEX_TO_INFO, "with_type")
    .append("<before>", ("_work_type", "Work"))
).rules


# Map BibTex to Info object. Ignore ID attribute
#   Derived from BIBTEX_TO_INFO through ModifyRules, adding "ID" to
#   "<set_ignore_keys>"
BIBTEX_TO_INFO_IGNORE_SET_ID = (
    ModifyRules(BIBTEX_TO_INFO, "ignore_set")
    .add("<set_ignore_keys>", "ID")
).rules


# Convert Article info to Info object
#   If the work exists, it should populate the attribute _nwork (use find_work_by_info)
#   Both lambdas build lists only for the side effects of their setitem calls;
#   the branch is selected by the value of new["_work_type"] ("Site", "Ref" or other)
ARTICLE_TO_INFO = {
    "<after>": [
        (None, lambda old, new, current:
            # Site: build the pyref code and the Site object from name and url
            [
                setitem(new, "pyref", 'Site("{name}", "{url}")'.format(**new)),
                setitem(new, "_nwork", Site(new["name"], new["url"]))
            ] if new.get("_work_type") == "Site" else
            # Ref: load the referenced work and copy its name and place name
            [
                setitem(new, "_nwork", work_by_varname(new["pyref"])),
                setitem(new, "name", new["_nwork"].name),
                setitem(new, "place", new["_nwork"].place.name),
            ] if new.get("_work_type") == "Ref" else 
            # Otherwise: fill the missing place, display and pyref, then search the work
            [
                setitem(new, "place", _place_value(new["place1"])) if "place" not in new else None,
                setitem(new, "display", last_name_first_author(new["authors"])) if "display" not in new else None,
                setitem(new, "pyref", info_to_pyref(new)) if "pyref" not in new else None,
                setitem(new, "_nwork", find_work_by_info(new, set())),
            ]
        
        )
    ],
    "div": [None],
    "citation_id": [None],
    "<article>": [
        (None, lambda article, new: 
            [
                setitem(article, "_show_site", True),
                setitem(article, "_ref", article.get("citation_id", "")),
            ] if new.get("_work_type") == "Site" else
            [
                setitem(article, "name", new.get("name")),
                setitem(article, "place", new.get("place")),
                setitem(article, "_ref", article.get("citation_id", "")),
            ] if new.get("_work_type") == "Ref" else [
                setitem(article, "_ref", article.get("citation_id", "")),
            ]
           
        ),
    ]
}


# Map Info object to Insert Code
#   "<result>" formats the final `pyref = DB(WorkType(...))` statement from
#   the keys collected in `new`
INFO_TO_INSERT = {
    "<before>": [
        ("work_type", "Work"), ("due", ""), ("related", ""),
        ("display", ""), ("authors", ""), ("the_place", ""), ("other", ""),
    ],
    "<middle>": [
        ("the_place", sequence([
            var_item("place", obj="new"),
            str_item("place1", obj="new"), 
        ])),
        # Every key still present in `current` becomes a `key=value,` line;
        # iterate over a copy because consume is called on `current` inside the loop
        ("other", lambda old, new, current: [
            "{}={!r},".format(key, consume(current, key)) # .replace('"', r'\"')
            for key in copy(current)
        ]),
        # Join the non-empty attribute lines using the indentation of the template
        ("attributes", lambda old, new: "\n            ".join(remove_empty([
            new["due"], new["related"], new["display"],
            new["authors"], new["the_place"]
        ] + new["other"]))),
    ],
    "<result>": lambda old, new, current: textwrap.dedent("""
        {pyref} = DB({work_type}(
            {year}, "{name}",
            {attributes}
        ))""".format(**new)),
    "_work_type": ["work_type"],
    "year": ["year"],
    "name": ["name"],
    "authors": [("authors", str_item("authors"))],
    "display": [("display", str_item("display"))],
    "pyref": ["pyref"],
    "place": ["place"],
    "place1": ["place1"],
    "may_be_related_to": [("related", str_list("may_be_related_to"))],
    "due": [("due", str_item("due"))] ,
    "excerpt": [None],
    "_ref": [None],
}


# Modify Info object for finding work
#   <skip> and <ignore> only exist for this
FIND_INFO_WORK = {
    "<skip>": [
        ("_work_type", "Site"),
    ],
    "<ignore>": ["_year", "_work", "_key"],
    "<before>": [
        # Derive a place name from the entry type for theses, reports and books
        ("place", lambda old: 
            None if "entrytype" not in old else
            "Thesis" if old["entrytype"] in ("phdthesis", "mastersthesis") else
            "TechReport" if old["entrytype"] == "techreport" else
            "Book" if old["entrytype"] == "book" else
            None
        ), 
        # When the info has an ID, look up the work by variable name and keep the ID as key
        ("_work", lambda old: work_by_varname(old["ID"]) if "ID" in old else None),
        ("_key", lambda old: old["ID"] if "ID" in old else None),
    ],
    "year": ["year", "_year"],
    "school": ["local"],
}


# Convert Work into BibTex dict
#   The "new" object starts with three parameters
#     _name: override the metakey name used as the BibTeX ID
#     _homogeneize: apply homogenize_latex_encoding to the result
#     _acronym: use the acronym instead of the full place text
#   The result does not need these attributes
def _work_to_bibtex_middle(work, new, current):
    """Set the place field of the BibTeX dict according to the entry type

    Arguments:

    * `work` -- work being exported
    * `new` -- BibTeX dict under construction; read for "ENTRYTYPE" and
      "_acronym", and updated in place with "booktitle" or "journal"
    * `current` -- not used by this implementation

    When DEBUG_FIELDS is set, it also prints a "[DEBUG WARNING]" line for
    every attribute of the work that matches neither the "<use>" nor the
    "<ignore>" patterns of WORK_TO_BIBTEX.
    """
    if DEBUG_FIELDS:
        use = callable_get(WORK_TO_BIBTEX, "<use>", [])
        ignore = callable_get(WORK_TO_BIBTEX, "<ignore>", [])
        for key in dir(work):
            if not match_any(key, use) and not match_any(key, ignore):
                print("[DEBUG WARNING]", work.display, work.year, key)

    # BibTeX field that receives the place. Entry types that are not listed
    # (book, theses, techreport, a missing entry type and any other type) get
    # no place field; using .get avoids a KeyError for unlisted types.
    new_place_key = {
        "incollection": "booktitle",
        "inproceedings": "booktitle",
        "misc": "booktitle",
        "article": "journal",
    }.get(new.get("ENTRYTYPE", ""))
    if not new_place_key:
        return

    if hasattr(work, "place"):
        conference_has_acronym = (
            new["_acronym"]
            and work.place.type == "Conference"
            and not work.place.acronym.startswith("<")
        )
        if conference_has_acronym:
            new[new_place_key] = str(work.place.acronym)
        else:
            new[new_place_key] = str(work.place)
    elif hasattr(work, "place1"):
        # Fall back to the free-text place
        new[new_place_key] = str(work.place1)

# Rules for converting a Work into a BibTeX dict
WORK_TO_BIBTEX = {
    "<before>": [
        # ID preference: explicit "_name", then the work metakey, then the first
        # token of the authors (or "a" when there is none) followed by the year
        ("ID", lambda work, new, current: (
            new["_name"] if new.get("_name", None) else
            work.metakey if hasattr(work, "metakey") else
            (lambda split: (split[0] if split else "a") + str(work.year))(work.authors.split())
        )),
    ],
    "<middle>": [
        (None, _work_to_bibtex_middle),
    ],
    # Callables, so the field lists are read when the rules are evaluated
    "<ignore>": lambda: BIBTEX_IGNORE_FIELDS + ["place", "place1"],
    "<use>": lambda: WORK_FIELDS,
    # Consume the three control parameters; the last list item is the result
    "<result>": lambda work, new, current: (
        [
            consume(new, "_name"),
            consume(new, "_acronym"),
            homogenize_latex_encoding(new) if consume(new, "_homogeneize") else new,
        ][-1]
    ), 
    "name": ["title"],
    "authors": ["author"],
    "local": ["address"],
    "organization": ["publisher"],
    "pp": ["pages"],
    "entrytype": ["ENTRYTYPE"],
}


### Snowballing Forms

def convert_citation_text_lines_to_info(text):
    """Convert lines of format '[N] author name place other year' to Info object

    If it is an invalid format, it should return "Incomplete".

    Default: Recognizes 3 patterns:

    - Citation: [N] > varname
    - Site: [N] SiteName http://
    - Work: [N] author name place other year

      - 'other' can be either lines with values, or use the format key=value

    Arguments:

    * `text` -- citation text with one element per line

    Returns a dict with the recognized keys, or the string "Incomplete".
    """
    lines = text.strip().split("\n")
    last = lines[-1].strip()
    info = {
        "citation_id": lines[0].strip(),
    }
    if last.startswith(">") and len(lines) >= 2:
        # [N] > varname
        # Remove the marker from the stripped line, so an indented "> varname"
        # line does not keep the ">" in the pyref
        info["pyref"] = last[1:].strip()
        info["_work_type"] = "Ref"
        other = lines[1:-1]
    elif last.startswith("http") and len(lines) >= 3:
        # [N] WebName http://...
        info["name"] = lines[1].strip()
        info["url"] = last
        info["_work_type"] = "Site"
        other = lines[2:-1]
    elif len(lines) >= 5 and last.isdecimal():
        # [N] author name place other year
        # isdecimal only accepts characters that int() can parse; isnumeric
        # also accepts characters such as "²", which make int() fail
        info["authors"] = lines[1].strip()
        info["name"] = lines[2].strip()
        info["place1"] = lines[3].strip()
        info["year"] = int(last)
        info["_work_type"] = "Work"
        other = lines[4:-1]
    else:
        return "Incomplete"

    for num, line in enumerate(other, 1):
        line = line.strip()
        # key=value lines set the key; the value may contain "=" itself
        key, separator, value = line.partition("=")
        if separator:
            info[key] = value
        else:
            info["other{}".format(num)] = line
    return info
def info_to_pyref(info):
    """Create pyref for info

    The pyref is "{display}{year}" followed by the first lowercase letter
    for which no work variable exists yet. If every letter is taken, the
    last letter ("z") is used.

    Doctest:

    .. doctest::

        >>> info_to_pyref({'display': 'pimentel', 'year': 2017})
        'pimentel2017a'
        >>> info_to_pyref({'display': 'pimentel', 'year': 2015})
        'pimentel2015b'
    """
    base = "{display}{year}".format(**info)
    # The generator stops at the first free letter, so no further lookups happen
    suffix = next(
        (char for char in ascii_lowercase if not work_by_varname(base + char)),
        ascii_lowercase[-1],
    )
    return base + suffix
def set_info_letter(info, letter):
    """Set letter of info object

    Replaces the trailing letter of info["pyref"] (or appends one when the
    pyref does not end with a letter) and appends " " + letter to
    info["display"]. The info object is modified in place.
    """
    base = info["pyref"]
    if base[-1].isalpha():
        # Drop the letter that is currently in use
        base = base[:-1]
    info["pyref"] = base + letter
    info["display"] += " " + letter
# Insert form
FORM = {
    # List of widgets
    # Each widget has at least 3 fields
    #   Type (text, dropdown, toggle, button)
    #   Label
    #   Variable
    # Some widgets require extra fields
    #   text, dropdown, and toggle: 4th field indicates the value
    #   dropdown: 5th field is a list with options
    "widgets": [
        # Base
        [
            "dropdown", "Type", "work_type", DEFAULT_CLASS,
            [tup[0] for tup in CLASSES]
        ],
        ["toggle", "File", "file_field", False],
        ["text", "Due", "due", ""],
        ["text", "Place", "place", ""],
        ["text", "Year", "year", ""],
        ["text", "Prefix Var", "prefix", ""],
        ["text", "PDFPage", "pdfpage", ""],
        ["text", "Related", "may_be_related_to", None],
        ["text", "Display", "display", None],
        ["text", "Link", "link", None],
        # Buttons
        ["button", "Ok", "_b1"],
    ],
    # List of events
    # Each event has 3 fields
    #   Widget variable
    #   Listener (observe, click)
    #   Action. It can be:
    #     A dictionary indicating which values to change
    #     A list with the action (first item is a str) and its parameters (other actions)
    #     A list of actions to run in sequence
    "events": [
        # Base
        ["due", "observe", [
            ["if",
                ["and",
                    ["!=", ":due", ""],
                    ["==", ":work_type", "Work"]
                ], {
                    "work_type": "WorkUnrelated",
                },
                ["if",
                    ["and",
                        ["==", ":due", ""],
                        ["==", ":work_type", "WorkUnrelated"]
                    ], {
                        "work_type": "Work"
                    },
                    []
                ]
            ]
        ]],
        ["place", "observe", [
            ["if",
                ["and",
                    ["==", ":place", "Lang"],
                    ["==", ":work_type", "Work"]
                ], {
                    "work_type": "WorkLang"
                },
                []
            ]
        ]],
        # Buttons
        ["_b1", "click", [
            {
                "work_type": "WorkOk",
                "file_field": True,
            },
            ["reload"],
        ]],
    ],
    # Exhibition of the widgets.
    # Each list indicates a row with other widgets
    # I suggest using no more than 4 buttons in a row
    # and no more than 2 inputs in a row
    # NOTE(review): "summary" is listed here but is not declared in "widgets",
    # and the "pdfpage" widget is not placed in any row — confirm this is intended.
    "order": [
        ["_b1"],
        ["work_type", "file_field"],
        ["due", "place"],
        ["year", "prefix"],
        ["may_be_related_to", "display"],
        ["summary", "link"],
    ],
    # Show operation
    #   Use the same "DSL" as the events to update info object.
    "show": [
        ["if", ["==", ".place", "Lang"], {"work_type": "WorkLang"}, [] ],
        ["update_info", "due", ":due", None, ""],
        ["update_info", "place", ":place", None, ""],
        ["update_info", "_work_type", ":work_type", None, "Work"],
        ["if",
            ["or",
                ["update_info", "year", ":year", None, ""],
                ["update_info", "display", ":prefix", None, ""],
            ],
            ["pyref"],
            []
        ],
        ["update_info", "file", ":file_field", ["+", ".pyref", r"\.pdf"], False],
        ["update_info", "file", ":pdfpage", ["+", ".file", "#page=", ":pdfpage"], ""],
        ["update_info", "may_be_related_to", ":may_be_related_to", None, ""],
        ["update_info", "display", ":display", None, ""],
        ["update_info", "link", ":link", None, ""],
    ]
}


### Checks
def check_insertion(nwork, info, citation_var, citation_file, should_add, ref=""):
    """Check info validity after executing snowballing.create_info_code

    Return dictionary of warning fields

    Default: returns place1 if place is not defined and always returns
    the pdf name

    Only `info` is read by this default implementation.
    """
    warnings = {"pdf": "{}.pdf".format(info["pyref"])}
    if "place" in info:
        return warnings
    # Sites and references are not expected to have a place
    if info.get("_work_type") not in ("Site", "Ref"):
        warnings["place1"] = info["place1"]
    return warnings
def check_load(work, varname, warning=lambda x: print(x)):
    """Check conditions on load

    Calls `warning` with a message when the work has no `scholar_id`
    attribute, and again when its `place` is missing or None.
    """
    lacks_scholar_id = not hasattr(work, "scholar_id")
    if lacks_scholar_id:
        warning("[Warning] Work {} does not have scholar_id".format(varname))

    place = getattr(work, "place", None)
    if place is None:
        warning("[Error] Work {} does not have place".format(varname))
### Snowballing display
def display_article(article):
    """Display article in widget

    Returns a list of displayable items: a style block plus the repr of
    article["div"] as HTML when the article has a "div"; otherwise only
    the article name.
    """
    if "div" not in article:
        return [article["name"]]

    style = HTML(""" <style> .gs_or_svg { position: relative; width: 29px; height: 16px; vertical-align: text-bottom; fill: none; stroke: #1a0dab; } </style> """)
    content = HTML(repr(article["div"]))
    return [style, content]
### Query Scholar
def query_str(work):
    """Return string to query work on scholar

    Default: title and authors separated by a space
    """
    title = work.name
    authors = work.authors
    return title + " " + authors
### Work Attributes
def work_post_init(work):
    """Instructions to execute after Work __init__

    Default: set display to title if display is None (or not defined)
    """
    current = getattr(work, "display", None)
    if current is not None:
        return
    work.display = work.name
def work_eq(first, second):
    """Compare work

    Default: compare place, title and year

    A missing place is treated as None on both sides.
    """
    first_place = getattr(first, "place", None)
    second_place = getattr(second, "place", None)
    if first_place != second_place:
        return False
    same_title = first.name == second.name
    return same_title and first.year == second.year
def work_hash(work):
    """Uniquely identify work

    Default: use title and year
    """
    identity = (work.name, work.year)
    return hash(identity)
def info_work_match(info, work):
    """Check if an Info object matches to a Work object

    Default: checks for exact matches by cluster_id, scholar_id,
    (year, name, authors) of aliases and similar titles

    It uses matches on places and years to reduce the similarity
    requirements for matching titles
    """
    # The "<ii>" and "<iw>" defaults differ, so a missing id on both sides
    # does not match.
    # NOTE(review): `in` is a substring test when the work id is a string
    # (an empty info id would then always match) — confirm the expected types.
    if info.get("cluster_id", "<ii>") in getattr(work, "cluster_id", "<iw>"):
        return True
    if info.get("scholar_id", "<ii>") in getattr(work, "scholar_id", "<iw>"):
        return True
    for alias in get_work_aliases(work):
        # An alias is indexed as (year, name[, authors]); without alias
        # authors, the authors of the work are compared instead
        condition = (
            alias[0] == info["year"]
            and alias[1] == info["name"]
            and (
                len(alias) > 2 and alias[2] == info["authors"]
                or getattr(work, "authors", None) == info["authors"]
            )
        )
        if condition:
            return True

    # Title similarity threshold: raised when the info has no year (0) and
    # lowered when both sides have the same place name
    required = 0.9
    if info["year"] == 0:
        required += 0.1

    same_place = (
        "place" in info
        and hasattr(work, "place")
        and getattr(
            getattr(MODULES["places"], info["place"], None),
            "name", "<ii>"
        ) == getattr(
            getattr(work, "place", None),
            "name", "<iw>"
        )
    )
    if same_place:
        required -= 0.1

    if compare_str(str(getattr(work, "name")), info.get("name")) > required:
        return True
    return False
def work_display(work):
    """Get work display

    Default: return the `display` attribute of the work
    """
    return work.display
### Aliases
def get_work_aliases(work):
    """Get list of aliases from work

    Default: the tool uses the attributes "alias" and "aliases" to
    represent aliases

    A single "alias" takes precedence and is wrapped in a list; "aliases"
    is returned as it is; without both, the result is an empty list.
    """
    if hasattr(work, "alias"):
        found = [work.alias]
    elif hasattr(work, "aliases"):
        found = work.aliases
    else:
        found = []
    return found
def get_alias_year(work, alias):
    """Get year from alias

    Default: return first element of tuple/list that represents the alias

    The `work` argument is not used by this default implementation.
    """
    return alias[0]
### Graph Attributes
def graph_place_text(work):
    """Get place text for graph

    Default: return place acronym

    Returns None when the work has no place (missing attribute or None).
    """
    place = getattr(work, "place", None)
    if place is None:
        return None
    return place.acronym
def graph_place_tooltip(work):
    """Generate place tooltip

    Default: tooltip with all information from Place object

    Delegates to generate_title with an empty `prepend`.
    """
    return generate_title(work.place, prepend="")
def work_tooltip(work):
    """Generate work tooltip

    Default: tooltip with paper title and authors, followed by a blank
    line and the result of work_to_bibtex for the work
    """
    from .operations import work_to_bibtex
    header = "{}\n{}".format(work.name, work.authors)
    bibtex = work_to_bibtex(work)
    return header + "\n\n" + bibtex
def citation_tooltip(citation):
    """Generate citation tooltip

    Default: "<work metakey> -> <citation metakey>" followed by the title
    generated from the citation, ignoring private attributes and both works
    """
    source_key = citation.work.metakey
    target_key = citation.citation.metakey
    arrow = "{0} -> {1}".format(source_key, target_key)
    details = generate_title(citation, ignore={"_.*", "work", "citation"})
    return arrow + details
### Approaches APPROACH_FORCE_PREFIX = "force_" APPROACH_RELATED_CATEGORY = "Related"
def approach_ids_from_work(approach, works):
    """Yield the "ID" of the works of an approach

    Only works that are keys of `works` and whose category contains
    "snowball" or "ok" are considered; the yielded value is
    works[work]["ID"].
    """
    accepted_categories = ("snowball", "ok")
    for candidate in approach.work:
        if candidate not in works:
            continue
        if any(tag in candidate.category for tag in accepted_categories):
            yield works[candidate]["ID"]
def approach_display(approach):
    """Get approach display

    Default: the work display of the approach with all spaces removed
    """
    return work_display(approach).replace(" ", "")
### Database # Module setting MODULES = { "places": None, "work": None, "citations": None, "groups": None, } # Map of Work attributes used by the tool # I advise against changing this variable since I may have forgotten to replace an attribute in a function ATTR = { # Work "category": "category", "metakey": "metakey", "pyref": "pyref", "year": "year", "name": "name", "site_link": "link", "email_authors": "authors", "citation_file": "citation_file", # Approach "approach_dont_cite": "dont_cite", "approach_work": "work", } # Similar to ATTR variable, bu provide a direct map from existing keys from scholar SCHOLAR_MAP = { "scholar_id": "scholar_id", "cluster_id": "cluster_id", "scholar": "scholar", "scholar_ok": "scholar_ok", } define_cvar(ATTR)