From 1babc38902a184c9360c6c536b8c01080b112c0e Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Thu, 5 Oct 2023 12:49:42 +0800 Subject: [PATCH] feat: add datset viewer, remove treesitter languages that hasn't been verified (#509) * refactor: remove not verified tree sitter queries * feat(experimental): add dataset viewer update --- Cargo.lock | 55 ------------------- crates/tabby-scheduler/Cargo.toml | 5 -- crates/tabby-scheduler/src/dataset.rs | 77 --------------------------- experimental/dataset-viewer/main.py | 47 ++++++++++++++++ 4 files changed, 47 insertions(+), 137 deletions(-) create mode 100644 experimental/dataset-viewer/main.py diff --git a/Cargo.lock b/Cargo.lock index 994e53e..99c0906 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3194,14 +3194,9 @@ dependencies = [ "tokio", "tracing", "tracing-test", - "tree-sitter-go", - "tree-sitter-java", - "tree-sitter-javascript", - "tree-sitter-lua", "tree-sitter-python", "tree-sitter-rust", "tree-sitter-tags", - "tree-sitter-typescript", "walkdir", ] @@ -3861,46 +3856,6 @@ dependencies = [ "regex", ] -[[package]] -name = "tree-sitter-go" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad6d11f19441b961af2fda7f12f5d0dac325f6d6de83836a1d3750018cc5114" -dependencies = [ - "cc", - "tree-sitter", -] - -[[package]] -name = "tree-sitter-java" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0bf5d3f508cbffcbfe1805834101c0d24297a8b6c2184ad9c595556c46d2420" -dependencies = [ - "cc", - "tree-sitter", -] - -[[package]] -name = "tree-sitter-javascript" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed" -dependencies = [ - "cc", - "tree-sitter", -] - -[[package]] -name = "tree-sitter-lua" -version = "0.0.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0968cf4962ead1d26da28921dde1fd97407e7bbcf2f959cd20cf04ba2daa9421" -dependencies = [ - "cc", - "tree-sitter", -] - [[package]] name = "tree-sitter-python" version = "0.20.2" @@ -3933,16 +3888,6 @@ dependencies = [ "tree-sitter", ] -[[package]] -name = "tree-sitter-typescript" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "079c695c32d39ad089101c66393aeaca30e967fba3486a91f573d2f0e12d290a" -dependencies = [ - "cc", - "tree-sitter", -] - [[package]] name = "try-lock" version = "0.2.4" diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index 25988ea..7ec099c 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -12,7 +12,6 @@ job_scheduler = "1.2.1" tabby-common = { path = "../tabby-common" } tantivy = { workspace = true } tracing = { workspace = true } -tree-sitter-javascript = "0.20.0" tree-sitter-tags = "0.20.2" walkdir = "2.3.3" lazy_static = { workspace = true } @@ -21,10 +20,6 @@ serde-jsonlines = { workspace = true } file-rotate = "0.7.5" tree-sitter-python = "0.20.2" tree-sitter-rust = "0.20.3" -tree-sitter-go = "0.20.0" -tree-sitter-java = "0.20.0" -tree-sitter-typescript = "0.20.2" -tree-sitter-lua = "0.0.19" [dev-dependencies] temp_testdir = "0.2" diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs index 4459ca3..a78235b 100644 --- a/crates/tabby-scheduler/src/dataset.rs +++ b/crates/tabby-scheduler/src/dataset.rs @@ -248,83 +248,6 @@ lazy_static! 
{ .unwrap(), ), ), - ( - "javascript", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_javascript::language(), - tree_sitter_javascript::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "jsx", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_javascript::language(), - tree_sitter_javascript::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "typescript", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_typescript::language_typescript(), - tree_sitter_typescript::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "tsx", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_typescript::language_tsx(), - tree_sitter_typescript::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "java", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_java::language(), - tree_sitter_java::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "go", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_go::language(), - tree_sitter_go::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "lua", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_lua::language(), - tree_sitter_lua::TAGS_QUERY, - "", - ) - .unwrap(), - ), - ), ]) }; } diff --git a/experimental/dataset-viewer/main.py b/experimental/dataset-viewer/main.py new file mode 100644 index 0000000..ffd338d --- /dev/null +++ b/experimental/dataset-viewer/main.py @@ -0,0 +1,47 @@ +import pandas as pd +import streamlit as st + +# force wide mode +st.set_page_config(layout="wide") + +st.write("Files") + +# read dataframe. 
# Location of the JSON-lines dataset shown by the viewer.
DATASET_PATH = "~/.tabby/dataset/data.jsonl"

# Files whose longest line reaches this length are hidden from the viewer.
MAX_LINE_LENGTH = 200


def load_dataset(path=DATASET_PATH):
    """Read the JSON-lines dataset at *path* into a DataFrame."""
    return pd.read_json(path, lines=True)


def filter_files(df, max_line_length=MAX_LINE_LENGTH):
    """Return the rows of *df* that the viewer should offer for selection.

    The ``git_url`` column is dropped when present (the viewer never shows
    it). A row is kept when its ``max_line_length`` is below
    *max_line_length* and its ``tags`` list is non-empty. The input frame is
    not modified.
    """
    # errors="ignore": a dataset without the column is still viewable.
    df = df.drop(columns=["git_url"], errors="ignore")
    df = df[df["max_line_length"] < max_line_length]
    # Column-wise map instead of a row-wise apply: on an empty frame,
    # apply(axis=1) yields a DataFrame rather than a boolean mask.
    return df[df["tags"].map(len) > 0]


def find_file(df, filepath):
    """Return the first row whose ``filepath`` equals *filepath*, or ``None``.

    ``None`` is returned when nothing matches, which includes the case where
    the select box has no options and therefore yields ``None`` itself.
    """
    matches = df[df["filepath"] == filepath]
    if matches.empty:
        return None
    return matches.iloc[0]


def get_range(lst, x):
    """Return the slice of *lst* from ``x['start']`` up to ``x['end']``.

    NOTE(review): the offsets presumably come from the tag extractor; if they
    are byte offsets, slicing a ``str`` here is only exact for ASCII content
    -- confirm against the dataset writer.
    """
    return lst[x["start"]:x["end"]]


def tag_kinds(tags):
    """Return the distinct ``syntax_type_name`` values of *tags*, sorted.

    Sorting keeps the multiselect options in a stable order between reruns;
    iterating a plain set of strings does not guarantee that.
    """
    return sorted({tag["syntax_type_name"] for tag in tags})


def render_file(row):
    """Draw one dataset row: source on the left, its tags on the right."""
    kinds = tag_kinds(row.tags)
    # Keyed by file path so every file keeps its own kind selection.
    enabled_kinds = st.multiselect(
        "Displayed Kinds", kinds, default=kinds, key=row.filepath
    )
    source_col, tags_col = st.columns(2)

    content = row.content
    with source_col:
        st.write(f"File: {row.filepath}")
        st.code(content, line_numbers=True)

    with tags_col:
        for tag in row.tags:
            kind = tag["syntax_type_name"]
            # Skip hidden kinds before doing any slicing work for them.
            if kind not in enabled_kinds:
                continue
            name = get_range(content, tag["name_range"])
            is_definition = "✅" if tag["is_definition"] else "❌"
            st.markdown(f"### `{name}`\nkind: {kind}, is_definition: {is_definition}")
            st.code(get_range(content, tag["range"]))


def main():
    """Load the dataset, let the user pick a file and render its tags."""
    df = filter_files(load_dataset())

    selected = st.selectbox(
        "Filename",
        df.filepath,
    )

    selected_row = find_file(df, selected)
    if selected_row is None:
        # Nothing survived the filters (or nothing is selected): say so
        # instead of failing on an out-of-bounds positional lookup.
        st.info("No file to display.")
        return

    render_file(selected_row)


# Streamlit runs the script as the ``__main__`` module, so this guard still
# fires under ``streamlit run`` while keeping the helpers importable.
if __name__ == "__main__":
    main()