Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kernel/Ext2FS: Add full support for large inodes #25456

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/serenity-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ jobs:
echo "::group::ninja run # Qemu output"
ninja run
echo "::endgroup::"
echo "::group::Check Filesystem Consistency"
e2fsck -f -y _disk_image
echo "::endgroup::"
echo "::group::Verify Output File"
mkdir fsmount
sudo mount -t ext2 -o loop,rw _disk_image fsmount
Expand Down
5 changes: 5 additions & 0 deletions Kernel/FileSystem/Ext2FS/Definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -409,10 +409,15 @@ struct ext2_inode_large {
__u32 i_crtime; /* File creation time */
__u32 i_crtime_extra; /* extra File creation time (nsec << 2 | epoch)*/
__u32 i_version_hi; /* high 32 bits for 64-bit version */
__u32 i_projid; /* Project ID */
};

#define i_size_high i_dir_acl

/* Layout of the 32-bit *_extra timestamp fields (nsec << 2 | epoch): */
/* number of low bits that carry the epoch extension of the seconds value */
#define EXT4_EPOCH_BITS 2
/* mask selecting the epoch bits of an *_extra field */
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
/* mask selecting every bit above the epoch bits, i.e. the nanoseconds part */
#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS)

#if defined(__KERNEL__) || defined(__linux__)
# define i_reserved1 osd1.linux1.l_i_reserved1
# define i_frag osd2.linux2.l_i_frag
Expand Down
49 changes: 39 additions & 10 deletions Kernel/FileSystem/Ext2FS/FileSystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,18 +210,28 @@ u64 Ext2FS::inode_size() const
{
return EXT2_INODE_SIZE(&super_block());
}

// Returns the blocks-per-group value that EXT2_BLOCKS_PER_GROUP reads from the superblock.
u64 Ext2FS::blocks_per_group() const
{
    auto const& superblock = super_block();
    return EXT2_BLOCKS_PER_GROUP(&superblock);
}

ErrorOr<void> Ext2FS::write_ext2_inode(InodeIndex inode, ext2_inode const& e2inode)
ErrorOr<void> Ext2FS::write_ext2_inode(InodeIndex inode, ext2_inode_large const& e2inode)
{
BlockIndex block_index;
unsigned offset;
if (!find_block_containing_inode(inode, block_index, offset))
return EINVAL;
auto buffer = UserOrKernelBuffer::for_kernel_buffer(const_cast<u8*>((u8 const*)&e2inode));

Vector<u8> inode_storage;
TRY(inode_storage.try_resize(inode_size()));

size_t used_inode_size = inode_size() > EXT2_GOOD_OLD_INODE_SIZE ? EXT2_GOOD_OLD_INODE_SIZE + e2inode.i_extra_isize : inode_size();
VERIFY(used_inode_size >= EXT2_GOOD_OLD_INODE_SIZE && used_inode_size <= inode_size());

memcpy(inode_storage.data(), &e2inode, min(used_inode_size, sizeof(ext2_inode_large)));

auto buffer = UserOrKernelBuffer::for_kernel_buffer(inode_storage.data());
return write_block(block_index, buffer, inode_size(), offset);
}

Expand Down Expand Up @@ -493,20 +503,31 @@ ErrorOr<NonnullRefPtr<Inode>> Ext2FS::create_inode(Ext2FSInode& parent_inode, St
if (parent_inode.m_raw_inode.i_links_count == 0)
return ENOENT;

ext2_inode e2inode {};
auto now = kgettimeofday().truncated_seconds_since_epoch();
ext2_inode_large e2inode {};
auto now = kgettimeofday().to_timespec();

u32 extra = Ext2FSInode::encode_time_to_extra(now.tv_sec, now.tv_nsec);

e2inode.i_mode = mode;
e2inode.i_uid = static_cast<u16>(uid.value());
ext2fs_set_i_uid_high(e2inode, uid.value() >> 16);
e2inode.i_gid = static_cast<u16>(gid.value());
ext2fs_set_i_gid_high(e2inode, gid.value() >> 16);
e2inode.i_size = 0;
e2inode.i_atime = now;
e2inode.i_ctime = now;
e2inode.i_mtime = now;
e2inode.i_atime = now.tv_sec;
e2inode.i_ctime = now.tv_sec;
e2inode.i_mtime = now.tv_sec;
e2inode.i_crtime = now.tv_sec;
e2inode.i_atime_extra = extra;
e2inode.i_ctime_extra = extra;
e2inode.i_mtime_extra = extra;
e2inode.i_crtime_extra = extra;
e2inode.i_dtime = 0;
e2inode.i_flags = 0;

if (inode_size() > EXT2_GOOD_OLD_INODE_SIZE)
e2inode.i_extra_isize = min(inode_size(), sizeof(ext2_inode_large)) - EXT2_GOOD_OLD_INODE_SIZE;

// For directories, add +1 link count for the "." entry in self.
e2inode.i_links_count = is_directory(mode);

Expand Down Expand Up @@ -606,7 +627,7 @@ ErrorOr<void> Ext2FS::free_inode(Ext2FSInode& inode)
}

// NOTE: After this point, the inode metadata is wiped.
memset(&inode.m_raw_inode, 0, sizeof(ext2_inode));
memset(&inode.m_raw_inode, 0, sizeof(ext2_inode_large));
inode.m_raw_inode.i_dtime = kgettimeofday().truncated_seconds_since_epoch();
TRY(write_ext2_inode(inode.index(), inode.m_raw_inode));

Expand Down Expand Up @@ -707,7 +728,11 @@ ErrorOr<NonnullRefPtr<Ext2FSInode>> Ext2FS::build_root_inode() const
auto inode = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) Ext2FSInode(const_cast<Ext2FS&>(*this), EXT2_ROOT_INO)));

auto buffer = UserOrKernelBuffer::for_kernel_buffer(reinterpret_cast<u8*>(&inode->m_raw_inode));
TRY(read_block(block_index, &buffer, sizeof(ext2_inode), offset));

size_t size = min(inode_size(), sizeof(ext2_inode_large));
VERIFY(size >= EXT2_GOOD_OLD_INODE_SIZE);

TRY(read_block(block_index, &buffer, size, offset));
return inode;
}

Expand Down Expand Up @@ -744,7 +769,11 @@ ErrorOr<NonnullRefPtr<Inode>> Ext2FS::get_inode(InodeIdentifier inode) const
auto new_inode = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) Ext2FSInode(const_cast<Ext2FS&>(*this), inode.index())));

auto buffer = UserOrKernelBuffer::for_kernel_buffer(reinterpret_cast<u8*>(&new_inode->m_raw_inode));
TRY(read_block(block_index, &buffer, sizeof(ext2_inode), offset));

size_t size = min(inode_size(), sizeof(ext2_inode_large));
VERIFY(size >= EXT2_GOOD_OLD_INODE_SIZE);

TRY(read_block(block_index, &buffer, size, offset));

TRY(m_inode_cache.try_set(inode.index(), new_inode));
return new_inode;
Expand Down
6 changes: 5 additions & 1 deletion Kernel/FileSystem/Ext2FS/FileSystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,14 @@ class Ext2FS final : public BlockBasedFileSystem {
u64 inodes_per_group() const;
u64 blocks_per_group() const;
u64 inode_size() const;
u64 used_inode_size() const;

ErrorOr<NonnullRefPtr<Ext2FSInode>> build_root_inode() const;

ErrorOr<void> write_ext2_inode(InodeIndex, ext2_inode const&);
// NOTE: The large Ext2 inode structure is a strict superset of the classic 128-byte inode structure,
// so this function simply ignores all the extra data if the filesystem doesn't support large inodes.
ErrorOr<void> write_ext2_inode(InodeIndex, ext2_inode_large const&);

bool find_block_containing_inode(InodeIndex, BlockIndex& block_index, unsigned& offset) const;

ErrorOr<void> flush_super_block();
Expand Down
76 changes: 63 additions & 13 deletions Kernel/FileSystem/Ext2FS/Inode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ static u8 to_ext2_file_type(mode_t mode)
return EXT2_FT_UNKNOWN;
}

// Reports whether a field of the raw inode lies inside the part of the inode that is in use.
//
// base         - address of the raw inode structure.
// value_offset - address of the field itself (an absolute address, not an offset relative to base).
// value_size   - size of the field in bytes.
//
// A field that fits entirely inside the classic EXT2_GOOD_OLD_INODE_SIZE-byte inode is always in
// bounds. Any other field must end at or before EXT2_GOOD_OLD_INODE_SIZE + i_extra_isize.
bool Ext2FSInode::is_within_inode_bounds(FlatPtr base, FlatPtr value_offset, size_t value_size) const
{
    // Offset (relative to the start of the inode) one past the last byte of the field.
    auto const field_end = value_offset - base + value_size;

    // Compare the END of the field against the classic inode size, so that a field which starts at
    // or just past the boundary is not mistaken for a member of the classic inode.
    if (field_end <= EXT2_GOOD_OLD_INODE_SIZE)
        return true;

    auto const used_inode_size = static_cast<u64>(EXT2_GOOD_OLD_INODE_SIZE + m_raw_inode.i_extra_isize);
    VERIFY(used_inode_size <= fs().inode_size());

    return field_end <= used_inode_size;
}

ErrorOr<void> Ext2FSInode::write_singly_indirect_block_pointer(BlockBasedFileSystem::BlockIndex logical_block_index, BlockBasedFileSystem::BlockIndex on_disk_index)
{
auto const entries_per_block = EXT2_ADDR_PER_BLOCK(&fs().super_block());
Expand Down Expand Up @@ -417,10 +427,38 @@ InodeMetadata Ext2FSInode::metadata() const
metadata.uid = inode_uid(m_raw_inode);
metadata.gid = inode_gid(m_raw_inode);
metadata.link_count = m_raw_inode.i_links_count;
metadata.atime = UnixDateTime::from_seconds_since_epoch(m_raw_inode.i_atime);
metadata.ctime = UnixDateTime::from_seconds_since_epoch(m_raw_inode.i_ctime);
metadata.mtime = UnixDateTime::from_seconds_since_epoch(m_raw_inode.i_mtime);
metadata.dtime = UnixDateTime::from_seconds_since_epoch(m_raw_inode.i_dtime);

// NOTE: All the *_extra fields have to be bounds-checked in case we have oddly-sized inodes.
// This is simply a correctness measure, since an OOB read wouldn't happen anyway due to the
// fact that we always store the raw inode as an ext2_inode_large.
if (is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_atime_extra), sizeof(m_raw_inode.i_atime_extra))) {
time_t seconds = decode_seconds_with_extra(m_raw_inode.i_atime, m_raw_inode.i_atime_extra);
u32 nanoseconds = decode_nanoseconds_from_extra(m_raw_inode.i_atime_extra);
metadata.atime = UnixDateTime::from_unix_timespec({ .tv_sec = seconds, .tv_nsec = nanoseconds });
} else {
metadata.atime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_atime));
}

// NOTE: There's no i_dtime_extra, so we use i_ctime_extra to approximate the right epoch for metadata.dtime (like e2fsprogs.)
if (is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_ctime_extra), sizeof(m_raw_inode.i_ctime_extra))) {
time_t ctime_seconds = decode_seconds_with_extra(m_raw_inode.i_ctime, m_raw_inode.i_ctime_extra);
u32 ctime_nanoseconds = decode_nanoseconds_from_extra(m_raw_inode.i_ctime_extra);

metadata.ctime = UnixDateTime::from_unix_timespec({ .tv_sec = ctime_seconds, .tv_nsec = ctime_nanoseconds });
metadata.dtime = UnixDateTime::from_seconds_since_epoch(decode_seconds_with_extra(m_raw_inode.i_dtime, m_raw_inode.i_ctime_extra));
} else {
metadata.ctime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_ctime));
metadata.dtime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_dtime));
}

if (is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_mtime_extra), sizeof(m_raw_inode.i_mtime_extra))) {
time_t seconds = decode_seconds_with_extra(m_raw_inode.i_mtime, m_raw_inode.i_mtime_extra);
u32 nanoseconds = decode_nanoseconds_from_extra(m_raw_inode.i_mtime_extra);
metadata.mtime = UnixDateTime::from_unix_timespec({ .tv_sec = seconds, .tv_nsec = nanoseconds });
} else {
metadata.mtime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_mtime));
}

metadata.block_size = fs().logical_block_size();
metadata.block_count = m_raw_inode.i_blocks;

Expand Down Expand Up @@ -895,18 +933,30 @@ ErrorOr<void> Ext2FSInode::update_timestamps(Optional<UnixDateTime> atime, Optio
MutexLocker locker(m_inode_lock);
if (fs().is_readonly())
return EROFS;
if (atime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max())
if (atime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max() && !is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_atime_extra), sizeof(m_raw_inode.i_atime_extra)))
return EINVAL;
if (ctime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max())
if (ctime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max() && !is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_ctime_extra), sizeof(m_raw_inode.i_ctime_extra)))
return EINVAL;
if (mtime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max())
if (mtime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max() && !is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_mtime_extra), sizeof(m_raw_inode.i_mtime_extra)))
return EINVAL;
if (atime.has_value())
m_raw_inode.i_atime = atime.value().to_timespec().tv_sec;
if (ctime.has_value())
m_raw_inode.i_ctime = ctime.value().to_timespec().tv_sec;
if (mtime.has_value())
m_raw_inode.i_mtime = mtime.value().to_timespec().tv_sec;
if (atime.has_value()) {
time_t seconds = atime.value().to_timespec().tv_sec;
u32 nanoseconds = atime.value().to_timespec().tv_nsec;
m_raw_inode.i_atime = static_cast<u32>(seconds);
m_raw_inode.i_atime_extra = encode_time_to_extra(seconds, nanoseconds);
}
if (ctime.has_value()) {
time_t seconds = ctime.value().to_timespec().tv_sec;
u32 nanoseconds = ctime.value().to_timespec().tv_nsec;
m_raw_inode.i_ctime = static_cast<u32>(seconds);
m_raw_inode.i_ctime_extra = encode_time_to_extra(seconds, nanoseconds);
}
if (mtime.has_value()) {
time_t seconds = mtime.value().to_timespec().tv_sec;
u32 nanoseconds = mtime.value().to_timespec().tv_nsec;
m_raw_inode.i_mtime = static_cast<u32>(seconds);
m_raw_inode.i_mtime_extra = encode_time_to_extra(seconds, nanoseconds);
}
set_metadata_dirty(true);
return {};
}
Expand Down
8 changes: 7 additions & 1 deletion Kernel/FileSystem/Ext2FS/Inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ class Ext2FSInode final : public Inode {
virtual ErrorOr<void> truncate_locked(u64) override;
virtual ErrorOr<int> get_block_address(int) override;

bool is_within_inode_bounds(FlatPtr base, FlatPtr value_offset, size_t value_size) const;

// Widens a 32-bit on-disk seconds value using the epoch bits (the low EXT4_EPOCH_BITS bits) of the
// matching *_extra field. The epoch bits, when non-zero, are added above bit 31 of the seconds value.
static time_t decode_seconds_with_extra(i32 seconds, u32 extra)
{
    auto const epoch_bits = extra & EXT4_EPOCH_MASK;
    if (!epoch_bits)
        return static_cast<time_t>(seconds);
    return static_cast<time_t>(seconds) + (static_cast<time_t>(epoch_bits) << 32);
}
// Extracts the nanoseconds part of an *_extra field, i.e. the bits above the epoch bits.
static u32 decode_nanoseconds_from_extra(u32 extra)
{
    auto const nanosecond_field = extra & EXT4_NSEC_MASK;
    return nanosecond_field >> EXT4_EPOCH_BITS;
}
// Builds an *_extra field value from a full-width seconds value and a nanoseconds value:
// the low EXT4_EPOCH_BITS bits hold the part of `seconds` that does not fit in a signed 32-bit
// value, and the nanoseconds are stored shifted above them.
static u32 encode_time_to_extra(time_t seconds, u32 nanoseconds)
{
    auto const wide_seconds = static_cast<time_t>(seconds);
    auto const epoch_bits = ((wide_seconds - static_cast<i32>(seconds)) >> 32) & EXT4_EPOCH_MASK;
    auto const nanosecond_bits = nanoseconds << EXT4_EPOCH_BITS;
    return epoch_bits | nanosecond_bits;
}

ErrorOr<BlockBasedFileSystem::BlockIndex> allocate_block(BlockBasedFileSystem::BlockIndex, bool zero_newly_allocated_block, bool allow_cache);
ErrorOr<u32> allocate_and_zero_block();

Expand Down Expand Up @@ -86,7 +92,7 @@ class Ext2FSInode final : public Inode {

mutable Ext2FSBlockView m_block_view;
HashMap<NonnullOwnPtr<KString>, InodeIndex> m_lookup_cache;
ext2_inode m_raw_inode {};
ext2_inode_large m_raw_inode {};
};

inline Ext2FS& Ext2FSInode::fs()
Expand Down
2 changes: 1 addition & 1 deletion Meta/build-image-extlinux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ dd if=/dev/zero of="${dev}${partition_number}" bs=1M count=1 status=none || die
echo "done"

printf "creating new filesystem... "
mke2fs -q -I 128 "${dev}${partition_number}" || die "couldn't create filesystem"
mke2fs -q "${dev}${partition_number}" || die "couldn't create filesystem"
echo "done"

printf "mounting filesystem... "
Expand Down
2 changes: 1 addition & 1 deletion Meta/build-image-grub.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ dd if=/dev/zero of="${dev}${partition_number}" bs=1M count=1 status=none || die
echo "done"

printf "creating new filesystem... "
mke2fs -q -I 128 "${dev}${partition_number}" || die "couldn't create filesystem"
mke2fs -q "${dev}${partition_number}" || die "couldn't create filesystem"
echo "done"

printf "mounting filesystem... "
Expand Down
2 changes: 1 addition & 1 deletion Meta/build-image-limine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ echo "done"

printf "creating new filesystems... "
mkfs.vfat -F 32 "${dev}p1" || die "couldn't create efi filesystem"
mke2fs -q -I 128 "${dev}p2" || die "couldn't create root filesystem"
mke2fs -q "${dev}p2" || die "couldn't create root filesystem"
echo "done"

printf "mounting filesystems... "
Expand Down
6 changes: 3 additions & 3 deletions Meta/build-image-qemu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ fi
# Prepend the toolchain qemu directory so we pick up QEMU from there
PATH="$SCRIPT_DIR/../Toolchain/Local/qemu/bin:$PATH"

INODE_SIZE=128
INODE_SIZE=256
INODE_COUNT=$(($(inode_usage "$SERENITY_SOURCE_DIR/Base") + $(inode_usage Root)))
INODE_COUNT=$((INODE_COUNT + 2000)) # Some additional inodes for toolchain files, could probably also be calculated
DISK_SIZE_BYTES=$((($(disk_usage "$SERENITY_SOURCE_DIR/Base") + $(disk_usage Root) ) * 1024 * 1024))
Expand Down Expand Up @@ -180,8 +180,8 @@ script_path=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
if [ $use_genext2fs = 1 ]; then
# regenerate new image, since genext2fs is unable to reuse the previously written image.
# genext2fs is very slow in generating big images, so I use a smaller image here. size can be updated
# if it's not enough.
# not using "-I $INODE_SIZE" since it hangs. Serenity handles whatever default this uses instead.
# if it's not enough. this also accounts for the fact that genext2fs only supports 128-byte inodes.
DISK_SIZE_BYTES=$((DISK_SIZE_BYTES - INODE_COUNT * 128))
genext2fs -B 4096 -b $((DISK_SIZE_BYTES / 4096)) -N "${INODE_COUNT}" -d mnt _disk_image || die "try increasing image size (genext2fs -b)"
# if using docker with shared mount, file is created as root, so make it writable for users
chmod 0666 _disk_image
Expand Down