feat: add dataset viewer, remove tree-sitter languages that haven't been verified (#509)

* refactor: remove unverified tree-sitter queries

* feat(experimental): add dataset viewer

update
wsxiaoys-patch-1
Meng Zhang 2023-10-05 12:49:42 +08:00 committed by GitHub
parent 55f68d4224
commit 1babc38902
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 137 deletions

55
Cargo.lock generated
View File

@ -3194,14 +3194,9 @@ dependencies = [
"tokio",
"tracing",
"tracing-test",
"tree-sitter-go",
"tree-sitter-java",
"tree-sitter-javascript",
"tree-sitter-lua",
"tree-sitter-python",
"tree-sitter-rust",
"tree-sitter-tags",
"tree-sitter-typescript",
"walkdir",
]
@ -3861,46 +3856,6 @@ dependencies = [
"regex",
]
[[package]]
name = "tree-sitter-go"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad6d11f19441b961af2fda7f12f5d0dac325f6d6de83836a1d3750018cc5114"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-java"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0bf5d3f508cbffcbfe1805834101c0d24297a8b6c2184ad9c595556c46d2420"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-javascript"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-lua"
version = "0.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0968cf4962ead1d26da28921dde1fd97407e7bbcf2f959cd20cf04ba2daa9421"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-python"
version = "0.20.2"
@ -3933,16 +3888,6 @@ dependencies = [
"tree-sitter",
]
[[package]]
name = "tree-sitter-typescript"
version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "079c695c32d39ad089101c66393aeaca30e967fba3486a91f573d2f0e12d290a"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "try-lock"
version = "0.2.4"

View File

@ -12,7 +12,6 @@ job_scheduler = "1.2.1"
tabby-common = { path = "../tabby-common" }
tantivy = { workspace = true }
tracing = { workspace = true }
tree-sitter-javascript = "0.20.0"
tree-sitter-tags = "0.20.2"
walkdir = "2.3.3"
lazy_static = { workspace = true }
@ -21,10 +20,6 @@ serde-jsonlines = { workspace = true }
file-rotate = "0.7.5"
tree-sitter-python = "0.20.2"
tree-sitter-rust = "0.20.3"
tree-sitter-go = "0.20.0"
tree-sitter-java = "0.20.0"
tree-sitter-typescript = "0.20.2"
tree-sitter-lua = "0.0.19"
[dev-dependencies]
temp_testdir = "0.2"

View File

@ -248,83 +248,6 @@ lazy_static! {
.unwrap(),
),
),
(
"javascript",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_javascript::language(),
tree_sitter_javascript::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"jsx",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_javascript::language(),
tree_sitter_javascript::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"typescript",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_typescript::language_typescript(),
tree_sitter_typescript::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"tsx",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_typescript::language_tsx(),
tree_sitter_typescript::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"java",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_java::language(),
tree_sitter_java::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"go",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_go::language(),
tree_sitter_go::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"lua",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_lua::language(),
tree_sitter_lua::TAGS_QUERY,
"",
)
.unwrap(),
),
),
])
};
}

View File

@ -0,0 +1,47 @@
import pandas as pd
import streamlit as st
# Page setup: use the full browser width so source and tags fit side by side.
st.set_page_config(layout="wide")
st.write("Files")

# Load the dataset; the file holds one JSON object per line.
df = pd.read_json("~/.tabby/dataset/data.jsonl", lines=True)

# Drop the column the viewer never displays.
del df["git_url"]

# Keep only files whose longest line is under 200 characters and that carry
# at least one tag. `map(len)` works on the column directly instead of
# building a row object per file via `apply(axis=1)`.
df = df[df["max_line_length"] < 200]
df = df[df["tags"].map(len) > 0]

selected = st.selectbox(
    "Filename",
    df.filepath,
)

# When no file survives the filters (or nothing is selected) there is no
# matching row; fall back to None so the `is not None` guard further down
# can skip rendering instead of `.iloc[0]` raising IndexError.
matches = df[df.filepath == selected]
selected_row = matches.iloc[0] if not matches.empty else None
def get_range(lst, x):
    """Return the part of ``lst`` delimited by ``x['start']`` and ``x['end']``.

    Args:
        lst: The sequence to cut from; the viewer passes the file content
            (a ``str``).
        x: A mapping with integer ``'start'`` (inclusive) and ``'end'``
            (exclusive) offsets.

    Returns:
        For ``str`` input, the text covered by the offsets interpreted as
        UTF-8 byte positions; for any other sequence, ``lst[start:end]``.
    """
    start, end = x['start'], x['end']
    if isinstance(lst, str):
        # tree-sitter-tags reports tag ranges as byte offsets, while Python
        # slices `str` by code point; slicing the encoded bytes keeps tags
        # aligned in files containing non-ASCII text (ASCII-only content is
        # unaffected). NOTE(review): assumes the dataset stores those byte
        # offsets unchanged -- confirm against the writer.
        # errors="replace" guards against an offset landing inside a
        # multi-byte character.
        return lst.encode("utf-8")[start:end].decode("utf-8", errors="replace")
    return lst[start:end]
# Render the selected file: full source on the left, one entry per tag on the
# right, limited to the syntax kinds the user leaves enabled.
if selected_row is not None:
    # Sort the distinct kinds so the multiselect options appear in a stable
    # order (a bare set has no defined ordering).
    kinds = sorted({x['syntax_type_name'] for x in selected_row.tags})
    # The widget is keyed by file path, so each file keeps its own selection.
    enabled_kinds = st.multiselect("Displayed Kinds", kinds, default=kinds, key=selected_row.filepath)

    col1, col2 = st.columns(2)
    content = selected_row.content

    with col1:
        st.write(f"File: {selected_row.filepath}")
        st.code(content, line_numbers=True)

    with col2:
        for tag in selected_row.tags:
            kind = tag['syntax_type_name']
            if kind not in enabled_kinds:
                continue

            name = get_range(content, tag['name_range'])
            # Both branches used to produce an empty string, so the field was
            # always blank; show an explicit marker for each case instead.
            is_definition = '✅' if tag['is_definition'] else '❌'
            st.markdown(f"### `{name}`\nkind: {kind}, is_definition: {is_definition}")
            st.code(get_range(content, tag['range']))