You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

207 lines
7.5 KiB

"""Script for copyright/license reports
Dependencies:
- python3
- ripgrep
Assumptions:
- copyright.txt file is in CWD
- copyright.txt is in https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ format
- all relevant files are within CWD and not ignored by git (ripgrep default behaviour)
For xournalpp execute using:
python3 scripts/compare_license.py
Workflow:
1. Run the script
2. If the script exits with status 1, adapt copyright.txt or this script (see comments I, II, III in the code below)
3. Rerun the script; it should now exit with status 0
Note: This script cannot automatically detect whether you added a file that should be licensed differently
but does not indicate this in any way. Please refer to comment II in the code below and add it to the whitelist.
"""
from typing import Set
import re
import os
import subprocess
def get_files_from_copyright_format(file: str) -> Set[str]:
    """Get all files listed in a copyright file.

    A path is picked up from a line that either starts with ``Files: `` or is
    a continuation line: seven leading spaces followed by a single path token
    made of letters, digits and the characters ``/ _ - . *``.

    Args:
        file: path of a file formatted according to
            https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/

    Returns:
        The set of listed paths/patterns, stripped of surrounding whitespace.
    """
    # Both alternatives put the path at column 7 ("Files: " is 7 characters
    # long), which is why the same slice works for either kind of line.
    entry = re.compile(r"^Files: |^ {7}[a-zA-Z0-9/_\-.*]* *$")
    # Read the file the caller asked for rather than a hard-coded name; the
    # explicit encoding keeps the result independent of the locale.
    with open(file, "r", encoding="utf-8") as f:
        return {line[7:].strip() for line in f if entry.match(line)}
def get_all_files() -> Set[str]:
    """Return the files reported by ``rg --files`` for the CWD.

    Returns:
        The output lines of ``rg --files``, stripped of surrounding
        whitespace, as a set.
    """
    # The context manager closes the pipe and waits for the child process
    # instead of leaving that to garbage collection.
    with os.popen('rg --files') as pipe:
        return {line.strip() for line in pipe}
def get_files_containing_copyright_or_license() -> Set[str]:
    """Find all files mentioning copyright/license that lack the project header.

    A file is returned when both of the following hold:
    - it contains "copyright" or "license" (case insensitive) and its path
      does not contain ".po" (this excludes .po files, which create only
      false positives)
    - it does not contain the string "@author Xournal++"
    """
    # Raw strings hand the regex backslashes to the shell unchanged without
    # relying on Python's treatment of unknown escape sequences.
    mention_cmd = r'rg -i -e "copyright" -e "license" -l | rg -v "\.po"'
    no_header_cmd = r'rg --files-without-match "@author Xournal\+\+"'

    # Each pipe is closed (and its child process waited for) on leaving the
    # `with` block.
    with os.popen(mention_cmd) as pipe:
        lc_files = {line.strip() for line in pipe}
    with os.popen(no_header_cmd) as pipe:
        files_without_header = {line.strip() for line in pipe}

    return lc_files & files_without_header
def get_changed_files_since(git_hash: str) -> Set[str]:
    """Return the paths that differ between `git_hash` and HEAD.

    Args:
        git_hash: revision to compare HEAD against

    Returns:
        The output lines of ``git diff <git_hash> HEAD --name-only``,
        stripped of surrounding whitespace, as a set.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
            Treating a failed diff as "nothing changed" would let the
            out-of-date check pass without having checked anything.
        FileNotFoundError: if the git executable cannot be found.
    """
    # An argument list (no shell) passes the revision to git verbatim.
    result = subprocess.run(
        ['git', 'diff', git_hash, 'HEAD', '--name-only'],
        stdout=subprocess.PIPE,
        universal_newlines=True,
        check=True,
    )
    return {line.strip() for line in result.stdout.splitlines()}
def get_source_files_missing_license_of_header(scanned_files:Set[str], all_files:Set[str]) -> Set[str]:
    """Return all `.cpp` files which do not have a license but their corresponding `.h` file has.

    Header and source are paired by their path without extension, and the
    result holds those extension-less paths (e.g. "src/Foo" stands for
    "src/Foo.cpp").

    Args:
        scanned_files (Set[str]): Files which have a license header
        all_files (Set[str]): all Files in the project (used for existence check)

    Returns:
        Set[str]: extension-less paths whose `.h` is in `scanned_files`, whose
        `.cpp` is not, and whose `.cpp` exists in `all_files`
    """
    header_ext = ".h"
    source_ext = ".cpp"

    # Slice the suffix off. str.strip() takes a set of characters and would
    # also remove matching characters from both ends of the path itself.
    licensed_headers = {
        f[:-len(header_ext)] for f in scanned_files if f.endswith(header_ext)
    }
    licensed_sources = {
        f[:-len(source_ext)] for f in scanned_files if f.endswith(source_ext)
    }

    # A header without a source file at all is not a finding.
    return {
        stem
        for stem in licensed_headers - licensed_sources
        if stem + source_ext in all_files
    }
# I: Add an entry if a file is detected automatically as a file with special
# license/copyright, but which is actually licensed/copyrighted under the same
# license/copyright as xournalpp.
# Please add a short comment explaining why it's whitelisted
def get_whitelist_not_listed():
    """Return the files that contain the searched-for substrings but do
    not need an entry in copyright.txt."""
    return {
        "ABOUT-NLS",                    # false positive
        "copyright.txt",                # copyright/license summary file
        "scripts/compare_license.py",   # this very script
        "CMakeLists.txt",               # false positive
        "LICENSE",                      # main license file
        "rpm/fedora/xournalpp.spec",    # false positive
        "windows-setup/xournalpp.nsi",  # false positive
        "ui/about.glade",               # false positive
        "src/win32/xpp.rc.in",          # false positive
    }
# II: Add an entry to the whitelist if you added a file which has special
# licensing/copyright but does not contain any of the substrings used to
# automatically identify such files
# The rationale should be explained in the copyright.txt file itself.
# Do not use comments in this file to explain the rationale.
def get_whitelist_not_found():
    """Return the entries that are listed in copyright.txt although the
    searched-for substrings do not occur in them."""
    entries = (
        "*",
        # Debian packaging files
        "debian/changelog",
        "debian/compat",
        "debian/control",
        "debian/docs",
        "debian/package_description",
        "debian/rules",
        "debian/source/format",
        # pixmaps
        "ui/pixmaps/application-x-xojpp.svg",
        "ui/pixmaps/application-x-xopp.svg",
        "ui/pixmaps/application-x-xopt.svg",
        "ui/pixmaps/com.github.xournalpp.xournalpp.png",
        "ui/pixmaps/com.github.xournalpp.xournalpp.svg",
        "ui/pixmaps/gnome-mime-application-x-xopp.svg",
        "ui/pixmaps/gnome-mime-application-x-xopt.svg",
        "ui/pixmaps/xopt.svg",
        # icon themes
        "ui/iconsColor-dark/*",
        "ui/iconsColor-light/*",
    )
    return set(entries)
# III: Update git commit hash to current commit once you checked
# that the changes do not affect the licensing information in copyright.txt
last_checked_git_commit_hash = "89eee20c3dab1d1bfc18d125f32d510b6862168d"

# Gather the three views that get compared: files changed since the last
# manual check, files listed in copyright.txt, and files detected by scanning.
changed_files = get_changed_files_since(last_checked_git_commit_hash)
summary_files = get_files_from_copyright_format("copyright.txt")
scanned_files = get_files_containing_copyright_or_license()

found = summary_files & scanned_files
not_found = summary_files - scanned_files - get_whitelist_not_found()
not_listed = scanned_files - summary_files - get_whitelist_not_listed()

# Copyright could change with the same commit. Hence, it needs to be excluded.
all_whitelisted = (get_whitelist_not_found() | get_whitelist_not_listed()) - {"copyright.txt"}

# Files inside copyright.txt or mentioned in whitelist should be checked for
# diffs affecting the license/copyright
out_of_date = (all_whitelisted | summary_files) & changed_files

missing_source_license = get_source_files_missing_license_of_header(scanned_files, get_all_files())

# Report section: one paragraph per kind of finding.
print("Found License/Copyright both in copyright.txt and repo: ",len(found))

if not_listed:
    print()
    print("No License/Copyright listed in copyright.txt (but found in repo):")
    for f in sorted(not_listed):
        print(" ", f)
else:
    print("- All automatically detected files listed or whitelisted")

if not_found:
    print()
    print("No License/Copyright found in repo (but listed in copyright.txt):")
    for f in sorted(not_found):
        print(" ", f)
else:
    print("- All listed files automatically detected or whitelisted")

if out_of_date:
    print()
    print("Following items are whitelisted or listed in copyright.txt but changed since last check:")
    for f in sorted(out_of_date):
        print(" ", f)
else:
    print("- No listed file got changed since the last check.")

if missing_source_license:
    print()
    print("Following `.cpp` files do NOT contain a license even though their accompanying `.h` file does.")
    for f in sorted(missing_source_license):
        print(" ", f)

# Exit status: 1 on the first kind of finding that applies, 0 otherwise.
# Each branch prints the required action before exiting.
if not_found or not_listed:
    print(" Update required")
    exit(1)

if out_of_date:
    print(" Recheck required")
    exit(1)

if missing_source_license:
    print(" Adding license header required")
    exit(1)

print("🎉 Success")
exit(0)