JuliaEcosystem · ericphanson · Jul 3, 2023 · Nov 24, 2022 · Jan 14, 2023 · Jan 14, 2023
diff --git a/Project.toml b/Project.toml
@@ -1,14 +1,16 @@
 name = "PackageAnalyzer"
 uuid = "e713c705-17e4-4cec-abe0-95bf5bf3e10c"
 authors = ["Mosè Giordano <[email protected]>"]
-version = "2.0.0"
+version = "3.0.0"
 
 [deps]
+AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 Git = "d7ba0133-e1db-5d97-8f8c-041e4b3a1eb2"
 GitHub = "bc5e4493-9b4d-5f90-b8aa-2b2bcaad7a26"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+JuliaSyntax = "70703baa-626e-46a2-a12c-08ffd08c73b4"
 Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd"
 LicenseCheck = "726dbf0d-6eb6-41af-b36c-cd770e0f00cc"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -20,11 +22,13 @@ Tokei_jll = "3ac119c9-1236-5556-b556-adc8150b0244"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [compat]
+AbstractTrees = "0.4"
 CodecZlib = "0.7"
 Git = "1.2.1"
 GitHub = "5.4"
 Legolas = "0.5"
 JSON3 = "1.5.1"
+JuliaSyntax = "0.4"
 LicenseCheck = "0.2"
 RegistryInstances = "0.1"
 julia = "1.6"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -131,9 +131,9 @@ struct PackageV1
     buildkite::Bool # does it use Buildkite?
     azure_pipelines::Bool # does it use Azure Pipelines?
     gitlab_pipeline::Bool # does it use Gitlab Pipeline?
-    license_files::Vector{LicenseV1}} # a table of all possible license files
+    license_files::Vector{LicenseV1} # a table of all possible license files
     licenses_in_project::Vector{String} # any licenses in the `license` key of the Project.toml
-    lines_of_code::Vector{LinesOfCodeV1} # table of lines of code
+    lines_of_code::Vector{LinesOfCodeV2} # table of lines of code
     contributors::Vector{ContributorsV1} # table of contributor data
     version::Union{String, Missing} # the version number, if a release was analyzed
     tree_hash::String # the tree hash of the code that was analyzed
@@ -142,7 +142,7 @@ end
 
 where:
 * `LicenseV1` contains fields `license_filename::String, licenses_found::Vector{String}, license_file_percent_covered::Float64`,
-* `LinesOfCodeV1` contains fields `directory::String, language::Symbol, sublanguage::Union{Nothing, Symbol}, files::Int, code::Int, comments::Int, blanks::Int`,
+* `LinesOfCodeV2` contains fields `directory::String, language::Symbol, sublanguage::Union{Nothing, Symbol}, files::Int, code::Int, comments::Int, blanks::Int, docstrings::Union{Missing, Int}`,
 * and `ContributorsV1` contains fields `login::Union{String,Missing}, id::Union{Int,Missing}, name::Union{String,Missing}, type::String, contributions::Int`.
 
 

diff --git a/src/LineCategories.jl b/src/LineCategories.jl
@@ -0,0 +1,117 @@
+# Here, we assign a category to every line of a file, with help from JuliaSyntax.
+# This lives in its own module to make import clashes easier to avoid.
+module CategorizeLines
+export LineCategories, LineCategory, Blank, Code, Docstring, Comment, categorize_lines!
+
+using JuliaSyntax: GreenNode, is_trivia, haschildren, is_error, children, span, SourceFile, source_location, Kind, kind, @K_str
+
+# Every line will have a single category. This way the total number across all categories
+# equals the total number of lines. This is useful for debugging and is reassuring to users.
+# However, a line may have multiple things on it, including comments, docstrings, code, etc.
+# We will choose the single category by a simple precedence rule, given by the following ordering.
+
+# Some constructs should apply to every line they span, while others shouldn't. For example, `module ... end` should have `module` and `end` counting
+# as code, but not necessarily all the stuff in between. Whereas for docstrings,
+# if we have a big docstring block, we do want to count all the lines in between as docstring.
+# So in the implementation, we treat `Code` as only applying to the first and last line,
+# while the rest apply to all intermediate lines.
+
+# For the ordering itself, we put `Blank` lowest, since if there's anything else on the line, we want to count it as that.
+# We put `Code` next, since it is the fallback, and we don't want it to override when we have more specific information.
+# Then comment, then docstring, so comments inside of docstrings count as docstrings.
+"""
+    LineCategory
+
+An `enum` corresponding to the possible categorization of a line of Julia source code.
+Currently:
+* `Blank`
+* `Code`
+* `Comment`
+* `Docstring`
+
+The declaration order doubles as a precedence order (`Blank < Code < Comment < Docstring`):
+when several categories are recorded for the same line, the greatest one is kept.
+"""
+@enum LineCategory Blank Code Comment Docstring
+
+# We will store the categories assigned to each line in a file with the following structure.
+# This keeps the `SourceFile` to facilitate printing.
+"""
+    LineCategories(path)
+
+Categorize each line in a file as a [`PackageAnalyzer.LineCategory`](@ref).
+Every line is assigned a single category.
+
+Showing a `LineCategories` as `text/plain` prints each categorized line of the source,
+prefixed by its line number and its category.
+"""
+struct LineCategories
+    # NOTE(review): the `LineCategories(path)` constructor documented above is not defined
+    # in this module -- presumably it lives elsewhere in the package; confirm.
+    # The `SourceFile` being categorized, kept so the original text can be printed back out.
+    source::SourceFile
+    # Maps a line number to the category currently assigned to that line.
+    dict::Dict{Int,LineCategory}
+end
+
+# Merge `category` into the categories stored in `lc`.
+# With `inclusive=true` every line in `starting_line:ending_line` is updated; otherwise only
+# the two endpoint lines are. A line keeps whichever category ranks higher in
+# `LineCategory`, so an update never lowers the category a line already has.
+function update!(lc::LineCategories, starting_line::Int, ending_line::Int, category::LineCategory; inclusive)
+    lines = inclusive ? (starting_line:ending_line) : (starting_line, ending_line)
+    for line in lines
+        # Lines that have not been seen yet start out at the lowest category, `Blank`.
+        # (Would be nice to have a `Dict` API to do this with a single lookup.)
+        previous = get(lc.dict, line, Blank)
+        lc.dict[line] = max(previous, category)
+    end
+    return nothing
+end
+
+# Print back out the source, but with line categories.
+# Each categorized line is written as `<line number> | <category> | <line text>`, in
+# increasing line-number order, with no newline after the final line.
+function Base.show(io::IO, ::MIME"text/plain", per_line_category::LineCategories)
+    source = per_line_category.source
+    code = source.code
+    is_first = true
+    for idx in sort!(collect(keys(per_line_category.dict)))
+        # Separate entries with a newline, rather than terminating each one
+        is_first || println(io)
+        is_first = false
+        line_start = source.line_starts[idx]
+        # `prevind` since this is the start of the next line, not the end of the previous one
+        line_end = min(prevind(code, source.line_starts[idx+1]), lastindex(code))
+
+        # Strip the line ending with `chomp` instead of unconditionally dropping the last
+        # character: a final line without a trailing newline would otherwise lose a real
+        # character, and `\r\n` endings would leave a stray `\r` in the output.
+        line = chomp(SubString(code, line_start:line_end))
+        print(io, rpad(idx, 5), "| ", rpad(per_line_category.dict[idx], 9), " | ", line)
+    end
+    return nothing
+end
+
+# Based on the recursive printing code for `GreenNode`s.
+# Instead of printing, every visited node records a category for the lines it touches.
+#
+# `pos` is the position in `source` at which `node` starts, `parent_kind` is the kind of the
+# enclosing node (`nothing` at the root), and `nesting` is the recursion depth, which is
+# only passed down to the children. Children are categorized before their parent.
+function categorize_lines!(d::LineCategories, node, source, nesting=0, pos=1, parent_kind=nothing)
+    first_line, _ = source_location(source, pos)
+    node_kind = kind(node)
+
+    # A leaf is attributed to the line it starts on; an inner node ends on the line of the
+    # position just past its last child.
+    last_line = first_line
+    if haschildren(node)
+        cursor = pos
+        for child in children(node)
+            categorize_lines!(d, child, source, nesting + 1, cursor, node_kind)
+            cursor += child.span
+        end
+        last_line, _ = source_location(source, cursor)
+    end
+
+    # Every category except `Code` covers the whole `first_line:last_line` range;
+    # `Code` only marks the first and last line.
+    category, inclusive = if node_kind == K"Comment"
+        (Comment, true)
+    elseif node_kind == K"NewlineWs"
+        (Blank, true)
+    elseif parent_kind == K"doc" && node_kind == K"string"
+        (Docstring, true)
+    else
+        (Code, false)
+    end
+    update!(d, first_line, last_line, category; inclusive)
+
+    return nothing
+end
+
+end
diff --git a/src/PackageAnalyzer.jl b/src/PackageAnalyzer.jl
@@ -6,11 +6,14 @@ using Pkg, TOML, UUIDs, Printf
 using LicenseCheck # for `find_license` and `is_osi_approved`
 using JSON3 # for interfacing with `tokei` to count lines of code
 using Tokei_jll # count lines of code
-using GitHub # Use GitHub API to get extra information about the repo
-using Git: Git
+import GitHub # Use GitHub API to get extra information about the repo
+import Git: Git
 using Downloads
 using Tar
 using CodecZlib
+using AbstractTrees
+using JuliaSyntax
+using JuliaSyntax: @K_str, kind
 using Legolas
 using Legolas: @schema, @version
 
@@ -20,7 +23,34 @@ using RegistryInstances: RegistryInstances, reachable_registries, PkgEntry
 # Ways to find packages
 export find_package, find_packages, find_packages_in_manifest
 # Ways to analyze them
-export analyze, analyze_manifest, analyze_packages
+export analyze, analyze_manifest, analyze_packages, LineCategories
+
+##
+# Borrowed from
+# https://github.com/beacon-biosignals/SlackThreads.jl/blob/74351c2863ec9a1cf22732873d4d2816aa9c140d/src/SlackThreads.jl#L27-L49
+const CATCH_EXCEPTIONS = Ref(true)
+
+# We turn off exception handling for our tests, to ensure we aren't throwing exceptions
+# that we're missing. But we have it on by default, since in ordinary usage we want to
+# be sure we are catching all exceptions.
+"""
+    @maybecatch expr log_str [ret]
+
+Evaluate `expr` inside a `try` block.
+
+If `expr` throws and `CATCH_EXCEPTIONS[]` is `true`, `log_str` is logged at debug level
+together with the exception and its backtrace, and the macro evaluates to `ret`
+(`nothing` by default). If `CATCH_EXCEPTIONS[]` is `false`, `log_str` is logged with the
+exception and the exception is rethrown.
+"""
+macro maybecatch(expr, log_str, ret=nothing)
+    quote
+        try
+            # `esc` so that the user's expression is evaluated in the caller's scope
+            $(esc(expr))
+        catch e
+            # The `Ref` itself is spliced into the generated code and dereferenced when the
+            # code runs, so the flag is checked each time an exception is caught.
+            if $(CATCH_EXCEPTIONS)[]
+                @debug $(esc(log_str)) exception = (e, catch_backtrace())
+                $(esc(ret))
+            else
+                # No stacktrace, because we'll get one anyway
+                @debug $(esc(log_str)) exception = e
+                rethrow()
+            end
+        end
+    end
+end
+#
+##
 
 # To support (de)-serialization
 export PackageV1, PackageV1SchemaVersion
@@ -44,14 +74,15 @@ end
 
 @schema "package-analyzer.lines-of-code" LinesOfCode
 
-@version LinesOfCodeV1 begin
+@version LinesOfCodeV2 begin
     directory::String
     language::Symbol
     sublanguage::Union{Nothing, Symbol}
     files::Int
     code::Int
     comments::Int
     blanks::Int
+    docstrings::Union{Missing, Int}
 end
 
 @schema "package-analyzer.contributions" Contributions
@@ -67,9 +98,16 @@ end
 
 @schema "package-analyzer.package" Package
 
+# Handle version serialization
+# https://github.com/apache/arrow-julia/issues/461
 convert_version(::Missing) = missing
 convert_version(::Nothing) = missing
 convert_version(v::Any) = string(v)
+
+# Bring a `lines_of_code` table up to `LinesOfCodeV2`.
+# A vector that already holds `LinesOfCodeV2` rows is returned as-is; anything else is
+# converted element-wise with the `LinesOfCodeV2` constructor.
+function upgrade_lines_of_code(loc::Vector{LinesOfCodeV2})
+    return loc
+end
+function upgrade_lines_of_code(loc)
+    return LinesOfCodeV2.(loc)
+end
+
 @version PackageV1 begin
     name::String # name of the package
     uuid::UUID # uuid of the package
@@ -89,13 +127,14 @@ convert_version(v::Any) = string(v)
     gitlab_pipeline::Bool # does it use Gitlab Pipeline?
     license_files::Vector{LicenseV1} # a table of all possible license files
     licenses_in_project::Vector{String} # any licenses in the `license` key of the Project.toml
-    lines_of_code::Vector{LinesOfCodeV1} # table of lines of code
+    lines_of_code::Vector{LinesOfCodeV2} = upgrade_lines_of_code(lines_of_code) # table of lines of code
     contributors::Vector{ContributionsV1} # table of contributor data
     # Note: ideally this would be Union{Nothing, VersionNumber}, however
     # Arrow seems to not be able to serialize that correctly: https://github.com/apache/arrow-julia/issues/461.
     version::Union{Missing, String}=convert_version(version) # the version number, if a release was analyzed
     tree_hash::String # the tree hash of the code that was analyzed
 end
+
 function PackageV1(name, uuid, repo;
                  subdir="",
                  reachable=false,
@@ -112,14 +151,15 @@ function PackageV1(name, uuid, repo;
                  gitlab_pipeline=false,
                  license_files=LicenseV1[],
                  licenses_in_project=String[],
-                 lines_of_code=LinesOfCodeV1[],
+                 lines_of_code=LinesOfCodeV2[],
                  contributors=ContributionsV1[],
                  version=nothing,
-                 tree_hash=""
+                 tree_hash="",
                  )
     return PackageV1(; name, uuid, repo, subdir, reachable, docs, runtests, github_actions, travis,
                    appveyor, cirrus, circle, drone, buildkite, azure_pipelines, gitlab_pipeline,
-                   license_files, licenses_in_project, lines_of_code, contributors, version, tree_hash)
+                   license_files, licenses_in_project, lines_of_code, contributors, version,
+                   tree_hash)
 end
 
 function Base.show(io::IO, p::PackageV1)
@@ -142,20 +182,29 @@ function Base.show(io::IO, p::PackageV1)
           * tree hash: $(p.tree_hash)
         """
         if !isempty(p.lines_of_code)
-            l_src = count_julia_loc(p, "src")
-            l_test = count_julia_loc(p, "test")
-            l_docs = count_docs(p)
-            l_readme = count_readme(p)
+            l_src = sum_julia_loc(p, "src")
+            l_test = sum_julia_loc(p, "test")
+            l_docs = sum_doc_lines(p)
+            l_readme = sum_readme_lines(p)
 
             p_test = @sprintf("%.1f", 100 * l_test / (l_test + l_src))
             p_docs = @sprintf("%.1f", 100 * l_docs / (l_docs + l_src))
+
             body *= """
                   * Julia code in `src`: $(l_src) lines
                   * Julia code in `test`: $(l_test) lines ($(p_test)% of `test` + `src`)
                   * documentation in `docs`: $(l_docs) lines ($(p_docs)% of `docs` + `src`)
-                  * documentation in README: $(l_readme) lines
                 """
-        end
+
+            l_src_docstring = sum_docstrings(p, "src")
+            if !ismissing(l_src_docstring)
+                n = l_src_docstring + l_readme
+                p_docstrings = @sprintf("%.1f", 100 * n / (n + l_src))
+                body *= """
+                      * documentation in README & docstrings: $(n) lines ($(p_docstrings)% of README + `src`)
+                    """
+                end
+            end
         if isempty(p.license_files)
             body *= "  * no license found\n"
         else
@@ -171,9 +220,9 @@ function Base.show(io::IO, p::PackageV1)
             body *= "    * OSI approved: $(all(is_osi_approved, p.licenses_in_project))\n"
         end
         if !isempty(p.contributors)
-            n_anon = count_contributors(p; type="Anonymous")
-            body *= "  * number of contributors: $(count_contributors(p)) (and $(n_anon) anonymous contributors)\n"
-            body *= "  * number of commits: $(count_commits(p))\n"
+            n_anon = sum_contributors(p; type="Anonymous")
+            body *= "  * number of contributors: $(sum_contributors(p)) (and $(n_anon) anonymous contributors)\n"
+            body *= "  * number of commits: $(sum_commits(p))\n"
         end
         body *= """
               * has `docs/make.jl`: $(p.docs)
@@ -315,7 +364,13 @@ include("parallel.jl")
 # github, parsing
 include("utilities.jl")
 
+include("LineCategories.jl")
+using .CategorizeLines
+
 # tokei, counting
 include("count_loc.jl")
 
+include("deprecated_schemas.jl")
+
+
 end # module
diff --git a/src/analyze.jl b/src/analyze.jl
@@ -141,7 +141,7 @@ end
 Convienence function to run [`find_packages_in_manifest`](@ref) then [`analyze`](@ref) on the results. Positional argument `path_to_manifest` defaults to `joinpath(dirname(Base.active_project()), "Manifest.toml")`.
 """
 function analyze_manifest(args...; registries=reachable_registries(),
-                          auth=github_auth(), sleep=0)
+    auth=github_auth(), sleep=0)
     pkgs = find_packages_in_manifest(args...; registries)
     return analyze_packages(pkgs; auth, sleep)
 end
@@ -260,10 +260,10 @@ function analyze_code(dir::AbstractString; repo="", reachable=true, subdir="", a
             subdir_licenses_files = [LicenseV1(; license_filename=joinpath(subdir, row.license_filename), row.licenses_found, row.license_file_percent_covered) for row in _find_licenses(pkgdir)]
             license_files = [subdir_licenses_files; license_files]
         end
-        lines_of_code = count_loc(pkgdir)
+        lines_of_code = count_lines_of_code(pkgdir)
     else
         license_files = LicenseV1[]
-        lines_of_code = LinesOfCodeV1[]
+        lines_of_code = LinesOfCodeV2[]
     end
 
     if isdir(pkgdir)