diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..a47d238
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,18 @@
+# v0.2.0
+
+Adds support for Java.
+
+Breaking:
+- Normalizations have been centralized to `snippets::parser::normalize`.
+- `snippets::text::buffer` has been merged into `snippets::text`.
+- Some vestigial traits (such as `snippets::text::ConvertCRLFToLF`) have been removed.
+- Implementation-specific constants such as `NODE_KIND_COMMENT` have been made private.
+- `tree-sitter` types have been removed from the API.
+
+# v0.1.3
+
+Adds support for C++.
+Adds support for C.
+
+This repository initially existed in FOSSA's [foundation-libs](https://github.com/fossas/foundation-libs/tree/master/snippets) monorepo.
+History for this library earlier than v0.1.3 can be viewed there.
diff --git a/Cargo.toml b/Cargo.toml
index 9d91b74..6c61d0b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +1,13 @@
 [package]
 name = "snippets"
-version = "0.1.3"
+version = "0.2.0"
 edition = "2021"
 description = "Extracts snippets of programming languages from files"

+# This is very FOSSA specific, so don't publish to crates.io.
+# Instead, add this as a git dependency: `cargo add snippets --git https://github.com/fossas/lib-snippets`.
+publish = false
+
 [features]
 default = []

@@ -12,28 +16,34 @@ sha2-asm = ["sha2/asm"]

 # Enables support for each named language.
 # For more info, see the module documentation for the language.
-lang-all = ["lang-c99-tc3", "lang-cpp-98"]
-lang-c99-tc3 = []
-lang-cpp-98 = []
+lang-all = ["lang-c99-tc3", "lang-cpp-98", "lang-java-11"]
+lang-c99-tc3 = ["tree-sitter-c"]
+lang-cpp-98 = ["tree-sitter-cpp"]
+lang-java-11 = ["tree-sitter-java"]

 [dependencies]
 base64 = "0.21.2"
+bstr = "1.8.0"
+colored = "2.1.0"
+delegate = "0.10.0"
 derivative = "2.2.0"
 derive_more = "0.99.17"
 fallible-iterator = { version = "0.3.0", features = ["std"] }
 flagset = "0.4.3"
 getset = "0.1.2"
 itertools = "0.11.0"
+lazy-regex = { version = "3.0.2", features = ["std"] }
+nonempty = "0.9.0"
 once_cell = "1.18.0"
-regex = "1.9.4"
 sha2 = "0.10.7"
 strum = { version = "0.25.0", features = ["derive"] }
 tap = "1.0.1"
 thiserror = "1.0.47"
 tracing = "0.1.37"
 tree-sitter = "0.20.10"
-tree-sitter-c = "0.20.6"
-tree-sitter-cpp = "0.20.3"
+tree-sitter-c = { version = "0.20.6", optional = true }
+tree-sitter-cpp = { version = "0.20.3", optional = true }
+tree-sitter-java = { version = "0.20.2", optional = true }
 tree-sitter-traversal = "0.1.2"
 typed-builder = "0.15.2"

@@ -42,6 +52,7 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 snippets = { path = ".", features = ["lang-all"] }
 criterion = "0.5.1"
 pretty_assertions = "1.4.0"
+indoc = "2.0.4"

 [[bench]]
 name = "hashes"
diff --git a/README.md b/README.md
index 0437f44..b585fd3 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,20 @@ Name | Description | Kind
 `lang-cpp` | Enables support for C++ 98. | Language
 `sha2-asm` | Enables hardware acceleration for SHA2 | Performance

-# History
+# Compatibility

-This repository initially existed in FOSSA's [foundation-libs](https://github.com/fossas/foundation-libs/tree/master/snippets) monorepo.
-History for this library earlier than v0.1.3 can be viewed there.
+FOSSA generally targets the latest releases of Rust, so there is no MSRV policy.
+Releases will generally use the latest Rust features.
+
+If this becomes a problem, open an issue and we can talk it out!
+
+# Developing
+
+## Release process
+
+- [Check semver compatibility](https://crates.io/crates/cargo-semver-checks) to choose the version to be released.
+  - For example, on the feature branch you can run `cargo semver-checks --baseline-rev main`.
+- Run `cargo doc --open` to validate that the public API appears as you expect.
+  - Note that this will show docs for dependency crates as well; just look at the `snippets` crate.
+- Update the changelog and merge the branch to `main`.
+- Release the version by tagging it. [`cargo-release`](https://crates.io/crates/cargo-release) is recommended.
diff --git a/benches/hashes.rs b/benches/hashes.rs
index b7d0f0a..5d6f7a9 100644
--- a/benches/hashes.rs
+++ b/benches/hashes.rs
@@ -1,6 +1,8 @@
+use std::ops::Deref;
+
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use sha2::{Digest, Sha256};
-use snippets::text::ConvertCRLFToLF;
+use snippets::parser::normalize;

 const INPUT: &[u8] = b"Iste nam laboriosam \r\n voluptatem \n distinctio.";

@@ -39,7 +41,7 @@ fn hash_vec() {

 fn hash_transform_iter() {
     let mut hasher = Sha256::new();
-    for c in INPUT.iter().copied().convert_crlf_lf() {
+    for &c in normalize::crlf(INPUT).deref() {
         hasher.update([c]);
     }
     let digest = hasher.finalize().as_slice().to_vec();
@@ -49,7 +51,7 @@ fn hash_transform_vec() {
     let mut hasher = Sha256::new();

-    let input = INPUT.iter().copied().convert_crlf_lf().collect::<Vec<_>>();
+    let input = normalize::crlf(INPUT);

     hasher.update(&input);
     let digest = hasher.finalize().as_slice().to_vec();
diff --git a/src/content.rs b/src/content.rs
new file mode 100644
index 0000000..9478745
--- /dev/null
+++ b/src/content.rs
@@ -0,0 +1,57 @@
+//! Specialized types for dealing with content provided to this library,
+//! or reading content into this library.
+
+use std::{borrow::Cow, path::Path};
+
+use derivative::Derivative;
+use derive_more::Index;
+use tap::Pipe;
+
+/// Specialized type to indicate the original content provided to the extractor,
+/// distinct from a sliced section of that content.
+#[derive(Clone, PartialEq, Eq, Derivative, Index)]
+#[derivative(Debug = "transparent")]
+pub struct Content(Vec<u8>);
+
+impl Content {
+    /// Create a new instance with the provided content.
+    pub fn new(content: Vec<u8>) -> Self {
+        Self(content)
+    }
+
+    /// Read a file on disk as content.
+    pub fn from_file(path: impl AsRef<Path>) -> Result<Self, std::io::Error> {
+        std::fs::read(path).map(Self::new)
+    }
+
+    /// View the content as a plain byte slice.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl<U: AsRef<[u8]>> From<U> for Content {
+    fn from(value: U) -> Self {
+        value.as_ref().pipe(|v| v.to_vec()).pipe(Self)
+    }
+}
+
+/// Common functionality for any type indicating a section of bytes to extract from [`Content`].
+pub trait ByteCoordinate {
+    /// The byte offset at which the function starts.
+    fn byte_start(&self) -> usize;
+
+    /// The byte offset at which the function ends.
+    fn byte_end(&self) -> usize;
+
+    /// Extract the text representing this part from the specified content.
+    fn extract_from<'a>(&self, content: &'a Content) -> &'a [u8] {
+        &content[self.byte_start()..self.byte_end()]
+    }
+
+    /// Extract the text representing this part from the specified content as a lossy string.
+    fn extract_from_lossy<'a>(&self, content: &'a Content) -> Cow<'a, str> {
+        let content = self.extract_from(content);
+        String::from_utf8_lossy(content)
+    }
+}
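As a quick orientation for reviewers, here is a minimal sketch of how the new `Content` and `ByteCoordinate` types compose. It is not part of the diff: the `FunctionSpan` type is hypothetical, and it assumes `src/content.rs` is exported from the crate root as `snippets::content`.

```rust
use snippets::content::{ByteCoordinate, Content};

// Hypothetical caller-side type; anything that can report a byte range
// gets the slicing helpers from `ByteCoordinate` for free.
struct FunctionSpan {
    start: usize,
    end: usize,
}

impl ByteCoordinate for FunctionSpan {
    fn byte_start(&self) -> usize {
        self.start
    }

    fn byte_end(&self) -> usize {
        self.end
    }
}

fn main() {
    // `Content` owns the raw bytes handed to an extractor.
    let content = Content::new(b"fn main() {}".to_vec());
    let span = FunctionSpan { start: 0, end: 9 };

    // `extract_from` borrows the matching bytes; `extract_from_lossy` decodes them.
    assert_eq!(span.extract_from(&content), b"fn main()");
    println!("{}", span.extract_from_lossy(&content));
}
```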
diff --git a/src/debugging.rs b/src/debugging.rs
index 2292206..b7b587b 100644
--- a/src/debugging.rs
+++ b/src/debugging.rs
@@ -1,5 +1,11 @@
 //! Provides debugging helpers to snippet extractors.

+use tap::{Pipe, TapOptional};
+use tracing::trace;
+use tree_sitter::Node;
+
+use crate::impl_prelude::*;
+
 /// Conversion trait for types that can be represented with [`EscapedText`].
 pub trait ToDisplayEscaped {
     fn display_escaped(&self) -> EscapedText;
@@ -25,3 +31,61 @@ impl<'a> std::fmt::Display for EscapedText<'a> {
         Ok(())
     }
 }
+
+pub(crate) fn inspect_node(node: &Node<'_>, content: &[u8]) {
+    let location = node.byte_range().pipe(SnippetLocation::from);
+    if node.is_error() {
+        let start = node.start_position();
+        let end = node.end_position();
+        trace!(
+            %location,
+            content = %location.extract_from(content).display_escaped(),
+            kind = %"syntax_error",
+            line_start = start.row,
+            line_end = end.row,
+            col_start = start.column,
+            col_end = end.column,
+        );
+    } else {
+        trace!(
+            %location,
+            content = %location.extract_from(content).display_escaped(),
+            kind = %node.kind(),
+        );
+    }
+}
+
+/// Convenience implementation for inspecting [`Node`] values
+/// in an iterator as they are iterated.
+pub trait NodeInspector<'a, I> {
+    /// Inspect each [`Node`] as it is iterated,
+    /// tracing the node content and shape.
+    fn inspect_nodes(self, content: &'a [u8]) -> InspectedNodes<'a, I>;
+}
+
+#[derive(Debug)]
+pub struct InspectedNodes<'a, I> {
+    content: &'a [u8],
+    iter: I,
+}
+
+impl<'a, 'b, I: Iterator<Item = Node<'b>>> NodeInspector<'a, I> for I {
+    /// Inspect each [`Node`] as it is iterated,
+    /// tracing the node content and shape.
+    fn inspect_nodes(self, content: &'a [u8]) -> InspectedNodes<'a, I> {
+        InspectedNodes {
+            content,
+            iter: self,
+        }
+    }
+}
+
+impl<'a, 'b, I: Iterator<Item = Node<'b>>> Iterator for InspectedNodes<'a, I> {
+    type Item = Node<'b>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter
+            .next()
+            .tap_some(|node| inspect_node(node, self.content))
+    }
+}
diff --git a/src/ext.rs b/src/ext.rs
new file mode 100644
index 0000000..cf369f2
--- /dev/null
+++ b/src/ext.rs
@@ -0,0 +1,3 @@
+//! Extensions for types.
+
+pub mod vec;
diff --git a/src/ext/vec.rs b/src/ext/vec.rs
new file mode 100644
index 0000000..cd6e646
--- /dev/null
+++ b/src/ext/vec.rs
@@ -0,0 +1,94 @@
+//! Extensions to [`Vec`] or Vec-like types.
+
+use nonempty::{nonempty, NonEmpty};
+
+/// Extends [`Vec`] to make it usable in a more functional style.
+pub trait FunctionalVec<T> {
+    /// Push the new element into self, returning the modified form.
+    fn pushed(self, new: T) -> Self;
+
+    /// Extend self with the provided [`IntoIterator`], returning the modified form.
+    fn extended(self, new: impl IntoIterator<Item = T>) -> Self;
+
+    /// Reverse self, returning the modified form.
+    fn reversed(self) -> Self;
+
+    /// Prepend self with the provided item.
+    fn prepended(self, new: T) -> Self;
+}
+
+impl<T> FunctionalVec<T> for Vec<T> {
+    fn pushed(mut self, new: T) -> Self {
+        self.push(new);
+        self
+    }
+
+    fn extended(mut self, new: impl IntoIterator<Item = T>) -> Self {
+        self.extend(new);
+        self
+    }
+
+    fn reversed(mut self) -> Self {
+        self.reverse();
+        self
+    }
+
+    fn prepended(self, new: T) -> Self {
+        vec![new].extended(self)
+    }
+}
+
+impl<T> FunctionalVec<T> for NonEmpty<T> {
+    fn pushed(mut self, new: T) -> Self {
+        self.push(new);
+        self
+    }
+
+    fn extended(mut self, new: impl IntoIterator<Item = T>) -> Self {
+        self.extend(new);
+        self
+    }
+
+    fn reversed(self) -> Self {
+        let mut tail = self.tail;
+        let old_head = self.head;
+
+        if let Some(head) = tail.pop() {
+            NonEmpty::new(head).extended(tail.reversed().pushed(old_head))
+        } else {
+            NonEmpty::new(old_head).extended(tail)
+        }
+    }
+
+    fn prepended(self, new: T) -> Self {
+        nonempty![new].extended(self)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use nonempty::nonempty;
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    #[test]
+    fn vec() {
+        let start = Vec::new;
+        assert_eq!(Vec::<i32>::new(), start().reversed());
+        assert_eq!(vec![1], start().pushed(1));
+        assert_eq!(vec![1, 2], start().pushed(1).pushed(2));
+        assert_eq!(vec![1, 2], start().extended([1, 2]));
+        assert_eq!(vec![2, 1], start().extended([1, 2]).reversed());
+    }
+
+    #[test]
+    fn nonempty() {
+        let start = || nonempty![1];
+        assert_eq!(nonempty![1], start());
+        assert_eq!(nonempty![1], start().reversed());
+        assert_eq!(nonempty![1, 2], start().pushed(2));
+        assert_eq!(nonempty![1, 2, 3], start().extended([2, 3]));
+        assert_eq!(nonempty![3, 2, 1], start().extended([2, 3]).reversed());
+    }
+}
diff --git a/src/language.rs b/src/language.rs
index 6608f57..2412719 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -9,9 +9,5 @@ pub mod c99_tc3;
 #[cfg(feature = "lang-cpp-98")]
 pub mod cpp_98;

-pub use normalize_code::*;
-pub use normalize_comments::*;
-
-mod normalize_code;
-mod normalize_comments;
-mod snippet_context;
+#[cfg(feature = "lang-java-11")]
+pub mod java_11;
diff --git a/src/language/c99_tc3.rs b/src/language/c99_tc3.rs
index 9d8e985..3c63bcd 100644
--- a/src/language/c99_tc3.rs
+++ b/src/language/c99_tc3.rs
@@ -35,15 +35,11 @@ use tracing::{debug, warn};
 use tree_sitter::Node;
 use tree_sitter_traversal::{traverse, traverse_tree, Order};

-use crate::debugging::ToDisplayEscaped;
-use crate::text::normalize_space;
-use crate::tree_sitter_consts::{NODE_KIND_FUNC_DEF, NODE_KIND_OPEN_BRACE};
+use crate::content::Content;
+use crate::debugging::{inspect_node, ToDisplayEscaped};
+use crate::parser::{normalize, NODE_KIND_FUNC_DEF, NODE_KIND_OPEN_BRACE};
 use crate::{impl_language, impl_prelude::*};

-use super::normalize_code::normalize_code;
-use super::normalize_comments::normalize_comments;
-use super::snippet_context::SnippetContext;
-
 /// This module implements support for C99 TC3.
 ///
 /// Review module documentation for more details.
@@ -63,16 +59,21 @@ pub struct Extractor;
 // If you make changes to this extractor, consider if they should also be made to the cpp_98 extractor
 // or if the functionality makes sense to be shared.
 impl SnippetExtractor for Extractor {
-    type Language = Language;
-
-    #[tracing::instrument(skip_all, fields(kinds = %opts.kinds(), transforms = %opts.transforms(), content_len = content.as_ref().len()))]
-    fn extract(
-        opts: &SnippetOptions,
-        content: impl AsRef<[u8]>,
-    ) -> Result<Vec<Snippet<Self::Language>>, ExtractorError> {
+    type Options = SnippetOptions;
+    type Output = Vec<Snippet<Language>>;
+
+    #[tracing::instrument(
+        skip_all,
+        fields(
+            kinds = %opts.kinds(),
+            methods = %opts.methods(),
+            content_len = content.as_bytes().len(),
+        )
+    )]
+    fn extract(opts: &Self::Options, content: &Content) -> Result<Self::Output, Error> {
         let mut parser = init_parser()?;

-        let content = content.as_ref();
+        let content = content.as_bytes();
         let Some(tree) = parser.parse(content, None) else {
             warn!("provided content did not parse to a tree");
             return Vec::new().pipe(Ok);
@@ -109,7 +110,7 @@ fn extract(
     meta: SnippetMetadata,
     node: Node<'_>,
     content: &[u8],
-) -> Option<Result<Snippet<Language>, ExtractorError>> {
+) -> Option<Result<Snippet<Language>, Error>> {
     match target {
         SnippetTarget::Function => extract_function(meta, node, content),
     }
@@ -120,7 +121,7 @@ fn extract_function(
     meta: SnippetMetadata,
     node: Node<'_>,
     content: &[u8],
-) -> Option<Result<Snippet<Language>, ExtractorError>> {
+) -> Option<Result<Snippet<Language>, Error>> {
     // The raw content here is just extracted for debugging.
     let raw = meta.location().extract_from(content);
     debug!(raw = %raw.display_escaped());
@@ -255,9 +256,9 @@ fn extract_text<'a>(method: SnippetMethod, context: &'a SnippetContext) -> Cow<'

 #[tracing::instrument(skip_all)]
 fn transform<'a>(transform: SnippetTransform, context: &'a SnippetContext) -> Cow<'a, [u8]> {
     match transform {
-        SnippetTransform::Code => normalize_code(context),
-        SnippetTransform::Comment => normalize_comments(context).into(),
-        SnippetTransform::Space => normalize_space(context.content()),
+        SnippetTransform::Code => normalize::code(context),
+        SnippetTransform::Comment => normalize::comments(context).into(),
+        SnippetTransform::Space => normalize::space(context.content()),
     }
 }
@@ -272,34 +273,12 @@ fn matches_target(target: SnippetTarget, node: Node<'_>) -> bool {
     }
 }

-#[tracing::instrument(skip_all)]
-fn inspect_node(node: &Node<'_>, content: &[u8]) {
-    let location = node.byte_range().pipe(SnippetLocation::from);
-    if node.is_error() {
-        let start = node.start_position();
-        let end = node.end_position();
-        warn!(
-            %location,
-            content = %location.extract_from(content).display_escaped(),
-            kind = %"syntax_error",
-            line_start = start.row,
-            line_end = end.row,
-            col_start = start.column,
-            col_end = end.column,
-        );
-    } else {
-        debug!(
-            %location,
-            content = %location.extract_from(content).display_escaped(),
-            kind = %node.kind(),
-        );
-    }
-}
-
 #[tracing::instrument]
-fn init_parser() -> Result<tree_sitter::Parser, ExtractorError> {
+pub(crate) fn init_parser() -> Result<tree_sitter::Parser, Error> {
     let mut parser = tree_sitter::Parser::new();
-    parser.set_language(tree_sitter_c::language())?;
+    parser
+        .set_language(tree_sitter_c::language())
+        .map_err(Error::configure)?;

     Ok(parser)
 }
diff --git a/src/language/cpp_98.rs b/src/language/cpp_98.rs
index 53ea890..d9c1289 100644
--- a/src/language/cpp_98.rs
+++ b/src/language/cpp_98.rs
@@ -42,15 +42,11 @@ use tracing::{debug, warn};
 use tree_sitter::Node;
 use tree_sitter_traversal::{traverse, traverse_tree, Order};

-use crate::debugging::ToDisplayEscaped;
-use crate::text::normalize_space;
-use crate::tree_sitter_consts::{NODE_KIND_FUNC_DEF, NODE_KIND_OPEN_BRACE};
+use crate::content::Content;
+use crate::debugging::{inspect_node, ToDisplayEscaped};
+use crate::parser::{normalize, NODE_KIND_FUNC_DEF, NODE_KIND_OPEN_BRACE};
 use crate::{impl_language, impl_prelude::*};
-use super::normalize_code::normalize_code;
-use super::normalize_comments::normalize_comments;
-use super::snippet_context::SnippetContext;
-
 /// This module implements support for CPP 98.
 ///
 /// Review module documentation for more details.
@@ -71,15 +67,21 @@ pub struct Extractor;
 // If you make changes to this extractor, consider if they should also be made to the c99_tc3 extractor
 // or if the functionality makes sense to be shared.
 impl SnippetExtractor for Extractor {
-    type Language = Language;
-
-    fn extract(
-        opts: &SnippetOptions,
-        content: impl AsRef<[u8]>,
-    ) -> Result<Vec<Snippet<Self::Language>>, ExtractorError> {
+    type Options = SnippetOptions;
+    type Output = Vec<Snippet<Language>>;
+
+    #[tracing::instrument(
+        skip_all,
+        fields(
+            kinds = %opts.kinds(),
+            methods = %opts.methods(),
+            content_len = content.as_bytes().len(),
+        )
+    )]
+    fn extract(opts: &Self::Options, content: &Content) -> Result<Self::Output, Error> {
         let mut parser = init_parser()?;

-        let content = content.as_ref();
+        let content = content.as_bytes();
         let Some(tree) = parser.parse(content, None) else {
             warn!("provided content did not parse to a tree");
             return Vec::new().pipe(Ok);
@@ -116,7 +118,7 @@ fn extract(
     meta: SnippetMetadata,
     node: Node<'_>,
     content: &[u8],
-) -> Option<Result<Snippet<Language>, ExtractorError>> {
+) -> Option<Result<Snippet<Language>, Error>> {
     match target {
         SnippetTarget::Function => extract_function(meta, node, content),
     }
@@ -127,7 +129,7 @@ fn extract_function(
     meta: SnippetMetadata,
     node: Node<'_>,
     content: &[u8],
-) -> Option<Result<Snippet<Language>, ExtractorError>> {
+) -> Option<Result<Snippet<Language>, Error>> {
     // The raw content here is just extracted for debugging.
     let raw = meta.location().extract_from(content);
     debug!(raw = %raw.display_escaped());
@@ -262,9 +264,9 @@ fn extract_text<'a>(method: SnippetMethod, context: &'a SnippetContext) -> Cow<'

 #[tracing::instrument(skip_all)]
 fn transform<'a>(transform: SnippetTransform, context: &'a SnippetContext) -> Cow<'a, [u8]> {
     match transform {
-        SnippetTransform::Code => normalize_code(context),
-        SnippetTransform::Comment => normalize_comments(context).into(),
-        SnippetTransform::Space => normalize_space(context.content()),
+        SnippetTransform::Code => normalize::code(context),
+        SnippetTransform::Comment => normalize::comments(context).into(),
+        SnippetTransform::Space => normalize::space(context.content()),
     }
 }
@@ -279,34 +281,12 @@ fn matches_target(target: SnippetTarget, node: Node<'_>) -> bool {
     }
 }

-#[tracing::instrument(skip_all)]
-fn inspect_node(node: &Node<'_>, content: &[u8]) {
-    let location = node.byte_range().pipe(SnippetLocation::from);
-    if node.is_error() {
-        let start = node.start_position();
-        let end = node.end_position();
-        warn!(
-            %location,
-            content = %location.extract_from(content).display_escaped(),
-            kind = %"syntax_error",
-            line_start = start.row,
-            line_end = end.row,
-            col_start = start.column,
-            col_end = end.column,
-        );
-    } else {
-        debug!(
-            %location,
-            content = %location.extract_from(content).display_escaped(),
-            kind = %node.kind(),
-        );
-    }
-}
-
 #[tracing::instrument]
-fn init_parser() -> Result<tree_sitter::Parser, ExtractorError> {
+pub(crate) fn init_parser() -> Result<tree_sitter::Parser, Error> {
     let mut parser = tree_sitter::Parser::new();
-    parser.set_language(tree_sitter_cpp::language())?;
+    parser
+        .set_language(tree_sitter_cpp::language())
+        .map_err(Error::configure)?;

     Ok(parser)
 }
@@ -323,6 +303,7 @@ impl<'a> FunctionParts<'a> {
     /// As a performance optimization, if the metadata only asks for the signature,
     /// body nodes are not stored. They are still traversed, in case treesitter
     /// iterates over nodes out of order.
+    #[tracing::instrument(skip_all, fields(meta))]
     fn from(meta: SnippetMetadata, node: Node<'a>, content: &'a [u8]) -> Self {
         let nodes = traverse(node.walk(), Order::Pre).inspect(|node| inspect_node(node, content));

diff --git a/src/language/java_11.rs b/src/language/java_11.rs
new file mode 100644
index 0000000..87eee13
--- /dev/null
+++ b/src/language/java_11.rs
@@ -0,0 +1,310 @@
+//! Implements an [`Extractor`] for the Java programming language.
+//!
+//! # Version
+//!
+//! This is based on the grammar available at https://github.com/tree-sitter/tree-sitter-java.
+//! It's not 100% clear which version of Java the grammar supports,
+//! but some of the contents of the codebase imply that it supports Java 11.
+//!
+//! That being said, this extractor should generally support newer syntax,
+//! so long as the new syntax doesn't prevent parsing functions out of the source code.
+//!
+//! [`Extractor`]: crate::Extractor
+
+use tap::{Pipe, Tap};
+use tracing::{debug, warn};
+use tree_sitter_traversal::{traverse_tree, Order};
+
+use crate::{
+    content::Content,
+    debugging::{inspect_node, NodeInspector, ToDisplayEscaped},
+    impl_language,
+    impl_prelude::*,
+    parser::{
+        java, stack::Stack, Argument, Arguments, Label, Parameter, Parameters, Scope, Visibility,
+        NODE_KIND_CONSTRUCTOR_DECL, NODE_KIND_METHOD_DECL,
+    },
+};
+
+/// This module implements support for Java 11.
+///
+/// Review module documentation for more details.
+#[derive(Copy, Clone)]
+pub struct Language;
+
+impl SnippetLanguage for Language {
+    const NAME: &'static str = "java_11";
+    const STRATEGY: LanguageStrategy = LanguageStrategy::Static;
+}
+
+impl_language!(Language);
+
+/// An empty struct used when no options are accepted.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct EmptyOptions;
+
+/// Extracts standard snippets from source code.
+///
+/// All targets extracted are extracted with the equivalent of
+/// [`SnippetTarget::Function`], [`SnippetKind::Full`], and [`SnippetMethod::Raw`].
+pub struct Extractor;
+
+impl SnippetExtractor for Extractor {
+    type Options = EmptyOptions;
+    type Output = Vec<Snippet<Language>>;
+
+    #[tracing::instrument(skip_all, fields(content_len = content.as_bytes().len()))]
+    fn extract(_: &Self::Options, content: &Content) -> Result<Self::Output, Error> {
+        let mut parser = parser()?;
+
+        let content = content.as_bytes();
+        let Some(tree) = parser.parse(content, None) else {
+            warn!("provided content did not parse to a tree");
+            return Vec::new().pipe(Ok);
+        };
+
+        traverse_tree(&tree, Order::Pre)
+            // Report syntax errors as warnings.
+            // Always write a debugging line for each node, regardless of the kind of node.
+            .inspect(|node| inspect_node(node, content))
+            // Nodes that are not "named" are syntax,
+            // which this function currently ignores.
+            //
+            // Reference:
+            // https://tree-sitter.github.io/tree-sitter/using-parsers#named-vs-anonymous-nodes
+            .filter(|node| node.is_named())
+            // Filter to the kinds this extractor cares about.
+            .filter(|node| {
+                let kind = node.kind();
+                kind == NODE_KIND_METHOD_DECL || kind == NODE_KIND_CONSTRUCTOR_DECL
+            })
+            // Hand each node off to be processed into a possible snippet,
+            // based on the provided options.
+            .map(|node| {
+                node.byte_range()
+                    .pipe(SnippetLocation::from)
+                    .pipe(|loc| extract_function(loc, node, content))
+            })
+            // Then just collect all the produced snippets and done!
+            // `FromIterator for Result` stops iteration on the first error as well.
+            .collect()
+    }
+}
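For context while reviewing, this is roughly how the new Java extractor is meant to be driven from outside the crate. It is a sketch rather than part of the diff: it assumes the crate re-exports `content`, `language::java_11`, and the `SnippetExtractor` trait from its root (internally they are imported via `impl_prelude`), so the exact paths may differ.

```rust
use snippets::content::Content;
use snippets::language::java_11::{EmptyOptions, Extractor};
use snippets::SnippetExtractor;

fn main() {
    // `Content` wraps the raw bytes of the file being scanned.
    let content = Content::from_file("Example.java").expect("read source file");

    // Java currently accepts no options, so the empty marker struct is passed.
    let snippets = Extractor::extract(&EmptyOptions, &content).expect("extract snippets");

    // Each snippet is a full, raw function; its fingerprint is what gets compared.
    for snippet in snippets {
        println!("{}", snippet.fingerprint());
    }
}
```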
+
+#[tracing::instrument(skip_all)]
+fn extract_function(
+    loc: SnippetLocation,
+    node: tree_sitter::Node<'_>,
+    content: &[u8],
+) -> Result<Snippet<Language>, Error> {
+    // The raw content here is just extracted for debugging.
+    let raw = loc.extract_from(content);
+    debug!(raw = %raw.display_escaped());
+
+    // The actual context, the part the snippet scanner cares about, is built here.
+    //
+    // In the future, this will likely be extended to support other kinds of context,
+    // similar to the C/C++ languages.
+    let context = SnippetContext::new(node, loc, content);
+    debug!(context = %context.content().display_escaped());
+
+    // Transformations would normally be applied on text extracted from the context.
+    // However, this language doesn't support transforms yet, so no transforms are performed.
+    let text = context.content();
+    debug!(text = %text.display_escaped());
+
+    // The more exact location generated above overwrites the overall node location,
+    // otherwise users would just always see the whole node.
+    //
+    // For now, this also asserts their kind and method.
+    // This is done so that the output shape is compatible with other snippets.
+    let meta = SnippetMetadata::new(SnippetKind::Full, SnippetMethod::Raw, context.location());
+    Snippet::from(meta, text)
+        .tap(|snippet| debug!(fingerprint = %snippet.fingerprint()))
+        .pipe(Ok)
+}
+
+/// Extracts function call graphs from source code.
+pub struct CallGraphExtractor;
+
+impl SnippetExtractor for CallGraphExtractor {
+    type Options = EmptyOptions;
+    type Output = Stack;
+
+    #[tracing::instrument(skip_all)]
+    fn extract(_: &Self::Options, content: &Content) -> Result<Self::Output, Error> {
+        let mut parser = parser()?;
+
+        let content = content.as_bytes();
+        let Some(tree) = parser.parse(content, None) else {
+            warn!("provided content did not parse to a tree");
+            return Stack::default().pipe(Ok);
+        };
+
+        // As the content is parsed, it can't be collapsed
+        // into something simple like a lookup table;
+        // this is because Java is a scoped language, so the same
+        // name may indicate multiple symbol paths at different scopes.
+        //
+        // Instead, this parser builds a stack of symbols,
+        // performing a naive search of the entire stack (from front to back)
+        // to resolve names into their fully qualified symbols.
+        //
+        // Symbols that are not found in the stack are assumed to be syntactically correct
+        // and declared in a different file in the same package.
+        //
+        // Note that this is not a generalized parser for Java;
+        // it is specific to _methods_ and therefore only stores
+        // symbols that are required for resolving methods to their
+        // declarations. As a concrete example, this parser ignores enums,
+        // because they are not relevant for looking up methods.
+        let mut stack = Stack::default();
+
+        // Build the stack. Reporting the call graph is a two-phase operation
+        // because each given symbol may depend on things that come
+        // later in the file.
+        for node in traverse_tree(&tree, Order::Pre).inspect_nodes(content) {
+            if let Some(scope) = java::scope(node, content) {
+                match scope {
+                    Scope::Enter(location) => stack.enter(location),
+                    Scope::Exit(location) => stack.exit(location),
+                }
+                continue;
+            }
+
+            if let Some(symbol) = java::symbol(node, content) {
+                stack.push(symbol);
+                continue;
+            }
+        }
+
+        Ok(stack)
+    }
+}
+
+/// A node in the file syntax tree.
+#[derive(Clone, Eq, PartialEq, Debug, Hash, strum::Display)]
+#[strum(serialize_all = "snake_case")]
+#[non_exhaustive]
+pub enum Node {
+    /// Represents a package name.
+    Package { name: Label },
+
+    /// Represents an import.
+    Import { name: Label },
+
+    /// Represents a class name.
+    Class { name: Label, visibility: Visibility },
+
+    /// Represents a constructor of a class.
+    Constructor {
+        name: Label,
+        params: Parameters,
+        visibility: Visibility,
+    },
+
+    /// Represents a method on a class.
+    Method {
+        name: Label,
+        params: Parameters,
+        visibility: Visibility,
+    },
+
+    /// Represents a method invocation.
+    ///
+    /// `target` is the name of the symbol on which the method was invoked;
+    /// this likely needs to be resolved in the current scope
+    /// (for example it is likely a variable or an import).
+    Invocation {
+        name: Label,
+        args: Arguments,
+        target: Label,
+    },
+
+    /// Represents a variable.
+    Variable {
+        name: Label,
+        type_name: Label,
+        visibility: Visibility,
+    },
+}
+
+impl Node {
+    pub fn new_package(name: impl Into