Skip to content

Commit

Permalink
Add support for Java (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
jssblck authored Dec 22, 2023
1 parent d0b2cd3 commit fda3a84
Show file tree
Hide file tree
Showing 36 changed files with 3,365 additions and 784 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# v0.2.0

Adds support for Java.

Breaking:
- Normalizations have been centralized to `snippets::parser::normalize`.
- `snippets::text::buffer` has been merged into `snippets::text`.
- Some vestigial traits (such as `snippets::text::ConvertCRLFToLF`) have been removed.
- Implementation-specific constants such as `NODE_KIND_COMMENT` have been made private.
- Removed `tree-sitter` types from the API.

# v0.1.3

Adds support for C++.
Adds support for C.

This repository initially existed in FOSSA's [foundation-libs](https://github.com/fossas/foundation-libs/tree/master/snippets) monorepo.
History for this library earlier than v0.1.3 can be viewed there.
25 changes: 18 additions & 7 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
[package]
name = "snippets"
version = "0.1.3"
version = "0.2.0"
edition = "2021"
description = "Extracts snippets of programming languages from files"

# This is very FOSSA specific, so don't publish to crates.io.
# Instead, just add this as a dependency using git path: `cargo add git@github.com:fossas/lib-snippets.git`.
publish = false

[features]
default = []

Expand All @@ -12,28 +16,34 @@ sha2-asm = ["sha2/asm"]

# Enables support for each named language.
# For more info, see the module documentation for the language.
lang-all = ["lang-c99-tc3", "lang-cpp-98"]
lang-c99-tc3 = []
lang-cpp-98 = []
lang-all = ["lang-c99-tc3", "lang-cpp-98", "lang-java-11"]
lang-c99-tc3 = ["tree-sitter-c"]
lang-cpp-98 = ["tree-sitter-cpp"]
lang-java-11 = ["tree-sitter-java"]

[dependencies]
base64 = "0.21.2"
bstr = "1.8.0"
colored = "2.1.0"
delegate = "0.10.0"
derivative = "2.2.0"
derive_more = "0.99.17"
fallible-iterator = { version = "0.3.0", features = ["std"] }
flagset = "0.4.3"
getset = "0.1.2"
itertools = "0.11.0"
lazy-regex = { version = "3.0.2", features = ["std"] }
nonempty = "0.9.0"
once_cell = "1.18.0"
regex = "1.9.4"
sha2 = "0.10.7"
strum = { version = "0.25.0", features = ["derive"] }
tap = "1.0.1"
thiserror = "1.0.47"
tracing = "0.1.37"
tree-sitter = "0.20.10"
tree-sitter-c = "0.20.6"
tree-sitter-cpp = "0.20.3"
tree-sitter-c = { version = "0.20.6", optional = true }
tree-sitter-cpp = { version = "0.20.3", optional = true }
tree-sitter-java = { version = "0.20.2", optional = true }
tree-sitter-traversal = "0.1.2"
typed-builder = "0.15.2"

Expand All @@ -42,6 +52,7 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
snippets = { path = ".", features = ["lang-all"] }
criterion = "0.5.1"
pretty_assertions = "1.4.0"
indoc = "2.0.4"

[[bench]]
name = "hashes"
Expand Down
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,20 @@ Name | Description | Kind
`lang-cpp-98` | Enables support for C++ 98. | Language
`sha2-asm` | Enables hardware acceleration for SHA2 | Performance

# History
# Compatibility

This repository initially existed in FOSSA's [foundation-libs](https://github.com/fossas/foundation-libs/tree/master/snippets) monorepo.
History for this library earlier than v0.1.3 can be viewed there.
FOSSA generally targets the latest releases of Rust, so there is no MSRV policy.
Releases will generally use the latest Rust features.

If this becomes a problem, open an issue and we can talk it out!

# Developing

## Release process

- [Check semver compatibility](https://crates.io/crates/cargo-semver-checks) to choose the version to be released.
- For example, on the feature branch you can run `cargo semver-checks --baseline-rev main`.
- Run `cargo doc --open` to validate the public API appears as you expect.
- Note that this will show docs for dependency crates as well; just look at the `snippets` crate.
- Update changelog and merge the branch to `main`.
- Release the version by tagging it. [`cargo-release`](https://crates.io/crates/cargo-release) is recommended.
8 changes: 5 additions & 3 deletions benches/hashes.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::ops::Deref;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use sha2::{Digest, Sha256};
use snippets::text::ConvertCRLFToLF;
use snippets::parser::normalize;

const INPUT: &[u8] = b"Iste nam laboriosam \r\n voluptatem \n distinctio.";

Expand Down Expand Up @@ -39,7 +41,7 @@ fn hash_vec() {

fn hash_transform_iter() {
let mut hasher = Sha256::new();
for c in INPUT.iter().copied().convert_crlf_lf() {
for &c in normalize::crlf(INPUT).deref() {
hasher.update([c]);
}
let digest = hasher.finalize().as_slice().to_vec();
Expand All @@ -49,7 +51,7 @@ fn hash_transform_iter() {

fn hash_transform_vec() {
let mut hasher = Sha256::new();
let input = INPUT.iter().copied().convert_crlf_lf().collect::<Vec<_>>();
let input = normalize::crlf(INPUT);
hasher.update(&input);
let digest = hasher.finalize().as_slice().to_vec();

Expand Down
57 changes: 57 additions & 0 deletions src/content.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
//! Specialized types for dealing with content provided to this library,
//! or reading content into this library.

use std::{borrow::Cow, path::Path};

use derivative::Derivative;
use derive_more::Index;
use tap::Pipe;

/// Specialized type to indicate the original content provided to the extractor,
/// distinct from a sliced section of that content.
///
/// This is a newtype around the owned bytes of the input. Sections of it are
/// obtained by indexing with a byte range (see `ByteCoordinate::extract_from`).
// NOTE(review): `Index` and the transparent `Debug` come from the `derive_more` and
// `derivative` derives; presumably both simply forward to the inner `Vec<u8>` — confirm.
#[derive(Clone, PartialEq, Eq, Derivative, Index)]
#[derivative(Debug = "transparent")]
pub struct Content(Vec<u8>);

impl Content {
/// Create a new instance with the provided content.
pub fn new(content: Vec<u8>) -> Self {
Self(content)
}

/// Read a file on disk as content.
pub fn from_file(path: impl AsRef<Path>) -> Result<Self, std::io::Error> {
std::fs::read(path).map(Self::new)
}

/// View the content as a plain byte slice.
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}

impl<U: AsRef<[u8]>> From<U> for Content {
fn from(value: U) -> Self {
value.as_ref().pipe(|v| v.to_vec()).pipe(Self)
}
}

/// Shared behavior for types that identify a contiguous section of bytes inside [`Content`].
pub trait ByteCoordinate {
    /// The byte offset at which the section starts.
    fn byte_start(&self) -> usize;

    /// The byte offset at which the section ends.
    fn byte_end(&self) -> usize;

    /// Borrow the bytes of `content` covered by `byte_start()..byte_end()`.
    ///
    // NOTE(review): this indexes `content` directly, so a reversed range or one that
    // reaches past the end of the content presumably panics — confirm against callers.
    fn extract_from<'a>(&self, content: &'a Content) -> &'a [u8] {
        let (start, end) = (self.byte_start(), self.byte_end());
        &content[start..end]
    }

    /// Like [`ByteCoordinate::extract_from`], but converts the bytes to text with
    /// [`String::from_utf8_lossy`].
    fn extract_from_lossy<'a>(&self, content: &'a Content) -> Cow<'a, str> {
        String::from_utf8_lossy(self.extract_from(content))
    }
}
64 changes: 64 additions & 0 deletions src/debugging.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
//! Provides debugging helpers to snippet extractors.

use tap::{Pipe, TapOptional};
use tracing::trace;
use tree_sitter::Node;

use crate::impl_prelude::*;

/// Conversion trait for types that can be represented with [`EscapedText`].
pub trait ToDisplayEscaped {
fn display_escaped(&self) -> EscapedText;
Expand All @@ -25,3 +31,61 @@ impl<'a> std::fmt::Display for EscapedText<'a> {
Ok(())
}
}

/// Emit a `trace`-level event describing `node`.
///
/// Every event carries the node's byte location and the escaped text which that
/// location extracts from `content`. Nodes for which [`Node::is_error`] is true are
/// reported with the kind `syntax_error` plus their start/end rows and columns;
/// all other nodes are reported with their own kind.
pub(crate) fn inspect_node(node: &Node<'_>, content: &[u8]) {
    let location = node.byte_range().pipe(SnippetLocation::from);

    // Ordinary nodes: location, text and kind are all that is recorded.
    if !node.is_error() {
        trace!(
            %location,
            content = %location.extract_from(content).display_escaped(),
            kind = %node.kind(),
        );
        return;
    }

    // Error nodes: also record the row/column span reported by the node.
    let (from, to) = (node.start_position(), node.end_position());
    trace!(
        %location,
        content = %location.extract_from(content).display_escaped(),
        kind = %"syntax_error",
        line_start = from.row,
        line_end = to.row,
        col_start = from.column,
        col_end = to.column,
    );
}

/// Convenience implementation for inspecting [`Node`] values
/// in an iterator as they are iterated.
///
/// `'a` is the lifetime of the content buffer borrowed by the returned adapter,
/// and `I` is the iterator type being wrapped.
pub trait NodeInspector<'a, I> {
/// Inspect each [`Node`] as it is iterated,
/// tracing the node content and shape.
///
/// `content` is the byte buffer handed to the tracing helper for every node;
/// the returned [`InspectedNodes`] borrows it for `'a`.
fn inspect_nodes(self, content: &'a [u8]) -> InspectedNodes<'a, I>;
}

/// Iterator adapter returned by [`NodeInspector::inspect_nodes`].
///
/// It yields the items of the wrapped iterator unchanged, passing each one
/// to `inspect_node` together with `content` before handing it on.
#[derive(Debug)]
pub struct InspectedNodes<'a, I> {
/// Byte buffer passed to `inspect_node` for every yielded node.
content: &'a [u8],
/// The wrapped iterator whose items are inspected.
iter: I,
}

impl<'a, 'b, I> NodeInspector<'a, I> for I
where
    I: Iterator<Item = Node<'b>>,
{
    /// Wrap this iterator so that every [`Node`] it yields is traced
    /// against `content` as it passes through.
    fn inspect_nodes(self, content: &'a [u8]) -> InspectedNodes<'a, I> {
        let iter = self;
        InspectedNodes { content, iter }
    }
}

impl<'a, 'b, I: Iterator<Item = Node<'b>>> Iterator for InspectedNodes<'a, I> {
    type Item = Node<'b>;

    /// Pull the next node from the wrapped iterator, trace it via `inspect_node`,
    /// and yield it unchanged. Returns `None` once the wrapped iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.iter
            .next()
            .tap_some(|node| inspect_node(node, self.content))
    }

    /// Forward the wrapped iterator's bounds.
    ///
    /// This adapter yields exactly the items of the wrapped iterator, so its hint
    /// is accurate here too; without this the default `(0, None)` would be reported.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
3 changes: 3 additions & 0 deletions src/ext.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Extensions for types.

pub mod vec;
94 changes: 94 additions & 0 deletions src/ext/vec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
//! Extensions to [`Vec`] or Vec-like types.

use nonempty::{nonempty, NonEmpty};

/// Chainable, by-value counterparts to the in-place mutators of [`Vec`]-like types.
///
/// Each method takes `self` by value, applies the change, and hands the collection
/// back, so several calls can be strung together in one expression.
pub trait FunctionalVec<T> {
    /// Append `new` after the existing elements and return the collection.
    fn pushed(self, new: T) -> Self;

    /// Append every item produced by `new`, in order, and return the collection.
    fn extended(self, new: impl IntoIterator<Item = T>) -> Self;

    /// Return the collection with its elements in the opposite order.
    fn reversed(self) -> Self;

    /// Place `new` ahead of the existing elements and return the collection.
    fn prepended(self, new: T) -> Self;
}

impl<T> FunctionalVec<T> for Vec<T> {
    fn pushed(self, new: T) -> Self {
        // A push is an extension by exactly one element.
        self.extended(std::iter::once(new))
    }

    fn extended(mut self, new: impl IntoIterator<Item = T>) -> Self {
        self.extend(new);
        self
    }

    fn reversed(mut self) -> Self {
        self.as_mut_slice().reverse();
        self
    }

    fn prepended(mut self, new: T) -> Self {
        self.insert(0, new);
        self
    }
}

impl<T> FunctionalVec<T> for NonEmpty<T> {
    fn pushed(mut self, new: T) -> Self {
        self.push(new);
        self
    }

    fn extended(mut self, new: impl IntoIterator<Item = T>) -> Self {
        self.extend(new);
        self
    }

    fn reversed(self) -> Self {
        let first = self.head;
        let mut rest = self.tail;

        match rest.pop() {
            // The last element becomes the new head; the remaining tail is flipped
            // and the old head goes to the very end.
            Some(last) => {
                rest.reverse();
                rest.push(first);
                NonEmpty::new(last).extended(rest)
            }
            // Only one element: reversing changes nothing.
            None => NonEmpty::new(first).extended(rest),
        }
    }

    fn prepended(self, new: T) -> Self {
        let mut out = nonempty![new];
        out.extend(self);
        out
    }
}

#[cfg(test)]
mod tests {
    use nonempty::nonempty;
    use pretty_assertions::assert_eq;

    use super::*;

    /// Exercises every `FunctionalVec` method on `Vec`, including `prepended`.
    #[test]
    fn vec() {
        let start = Vec::new;
        assert_eq!(Vec::<i32>::new(), start().reversed());
        assert_eq!(vec![1], start().pushed(1));
        assert_eq!(vec![1, 2], start().pushed(1).pushed(2));
        assert_eq!(vec![1, 2], start().extended([1, 2]));
        assert_eq!(vec![2, 1], start().extended([1, 2]).reversed());
        assert_eq!(vec![1], start().prepended(1));
        assert_eq!(vec![0, 1, 2], start().extended([1, 2]).prepended(0));
    }

    /// Exercises every `FunctionalVec` method on `NonEmpty`, including `prepended`.
    #[test]
    fn nonempty() {
        let start = || nonempty![1];
        assert_eq!(nonempty![1], start());
        assert_eq!(nonempty![1], start().reversed());
        assert_eq!(nonempty![1, 2], start().pushed(2));
        assert_eq!(nonempty![1, 2, 3], start().extended([2, 3]));
        assert_eq!(nonempty![3, 2, 1], start().extended([2, 3]).reversed());
        assert_eq!(nonempty![0, 1], start().prepended(0));
        assert_eq!(nonempty![0, 1, 2, 3], start().extended([2, 3]).prepended(0));
    }
}
8 changes: 2 additions & 6 deletions src/language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,5 @@ pub mod c99_tc3;
#[cfg(feature = "lang-cpp-98")]
pub mod cpp_98;

pub use normalize_code::*;
pub use normalize_comments::*;

mod normalize_code;
mod normalize_comments;
mod snippet_context;
#[cfg(feature = "lang-java-11")]
pub mod java_11;
Loading

0 comments on commit fda3a84

Please sign in to comment.