Skip to content

Commit 971feea

Browse files
committed
bug fixes; profiling setup
1 parent 8e7d9d4 commit 971feea

File tree

15 files changed

+295
-74
lines changed

15 files changed

+295
-74
lines changed

hf_xet_wasm/Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

hf_xet_wasm/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ web-sys = { version = "0.3.72", features = [
3636
] }
3737
js-sys = "0.3.72"
3838
futures = "0.3.31"
39-
sha2 = "0.10.8"
39+
sha2 = { version = "0.10.8", features = ["asm"] }
4040
blake3 = "1.7.0"
4141
getrandom = { version = "0.3", features = ["wasm_js"] }
4242
wasm_thread = "0.3"

hf_xet_wasm/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ cargo install wasm-bindgen-cli
99
- Build with `./build_wasm.sh` (bash)
1010

1111
#### Run Instructions
12-
Serve the web directory using a local http server, for example, https://crates.io/crates/sfz.
12+
First fill in the four `[FILL_ME]` fields in `examples/index.html` with the values for your test target.
13+
14+
Then serve the web directory using a local http server, for example, https://crates.io/crates/sfz.
1315

1416
- Install sfz:
1517
```bash

hf_xet_wasm/examples/commit.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/**
22
* Commits a file to a Hugging Face dataset.
33
*
4+
* @param {string} hf_endpoint - The HF Hub endpoint.
45
* @param {string} file_name The name of the file to commit.
56
* @param {string} sha256 The SHA256 hash of the file (as a string).
67
* @param {number} file_size The size of the file in bytes.
@@ -10,7 +11,7 @@
1011
* @param {string} hf_token The HF token for auth
1112
* @returns {Promise<string>} A promise that resolves if the commit is successful, or rejects if an error occurs.
1213
*/
13-
async function commit(endpoint, file_name, sha256, file_size, repo_type, repo_id, revision, hf_token) {
14+
async function commit(hf_endpoint, file_name, sha256, file_size, repo_type, repo_id, revision, hf_token) {
1415
const obj1 = {
1516
key: "header",
1617
value: {
@@ -32,7 +33,7 @@ async function commit(endpoint, file_name, sha256, file_size, repo_type, repo_id
3233
// Serialize to JSON string and concatenate with a newline.
3334
const body = `${JSON.stringify(obj1)}\n${JSON.stringify(obj2)}`;
3435

35-
const url = `${endpoint}/api/${repo_type}s/${repo_id}/commit/${revision}`;
36+
const url = `${hf_endpoint}/api/${repo_type}s/${repo_id}/commit/${revision}`;
3637

3738
try {
3839
const response = await fetch(url, {

hf_xet_wasm/examples/index.html

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,17 @@
1919
document.getElementById("file_picker").addEventListener(
2020
"change",
2121
async function () {
22-
const endpoint = "http://localhost:5564"
23-
const repo_type = "<your repo type>";
24-
const repo_id = "<your repo>";
25-
const hf_token = "<your token>";
22+
const hf_endpoint = "[FILL_ME]";
23+
const repo_type = "[FILL_ME]";
24+
const repo_id = "[FILL_ME]";
25+
const hf_token = "[FILL_ME]";
2626

2727
const file = this.files[0];
2828

2929
console.log("getting auth info...");
3030

3131
const xetmetadata = await fetchXetMetadataFromRepoInfo({
32-
endpoint,
32+
hfEndpoint: hf_endpoint,
3333
tokenType: "write",
3434
repoId: repo_id,
3535
repoType: repo_type,
@@ -76,21 +76,21 @@
7676

7777
const casEndpoint = xetmetadata.endpoint;
7878
console.log("casEndpoint:", casEndpoint);
79-
const tokenRefresher = new TokenRefresher();
80-
const tokenInfo = new TokenInfo(xetmetadata.accessToken, xetmetadata.expirationUnixEpoch);
81-
const xetSession = new XetSession(casEndpoint, tokenInfo, tokenRefresher);
79+
const tokenRefresher = new TokenRefresher();
80+
const tokenInfo = new TokenInfo(xetmetadata.accessToken, xetmetadata.expirationUnixEpoch);
81+
const xetSession = new XetSession(casEndpoint, tokenInfo, tokenRefresher);
8282

83-
const pf = await xetSession.uploadFileFromBlob("file_being_tracked", file);
84-
console.log("file cleaned to Xet session", pf);
85-
await xetSession.finalize();
83+
const pf = await xetSession.uploadFileFromBlob("file_being_tracked", file);
84+
console.log("file cleaned to Xet session", pf);
85+
await xetSession.finalize();
8686

8787
const file_name = file.name;
8888
const file_size = file.size;
8989

9090
const sha256 = pf.sha256;
9191
console.log("result: " + sha256);
9292

93-
let response = await commit(endpoint, file_name, sha256, file_size, repo_type, repo_id, "main", hf_token);
93+
let response = await commit(hf_endpoint, file_name, sha256, file_size, repo_type, repo_id, "main", hf_token);
9494
document.getElementById('result').textContent = response;
9595
},
9696
false

hf_xet_wasm/examples/simple.rs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ use futures::AsyncReadExt;
55
use hf_xet_wasm::blob_reader::BlobReader;
66
use hf_xet_wasm::configurations::{DataConfig, RepoSalt, ShardConfig, TranslatorConfig};
77
use hf_xet_wasm::wasm_file_upload_session::FileUploadSession;
8+
use hf_xet_wasm::wasm_timer::Timer;
9+
use log::Level;
810
use tokio::sync::mpsc;
911
use utils::auth::AuthConfig;
1012
use wasm_bindgen::prelude::*;
@@ -13,7 +15,7 @@ use wasm_thread as thread;
1315
fn main() {
1416
#[cfg(target_arch = "wasm32")]
1517
{
16-
console_log::init().unwrap();
18+
console_log::init_with_level(Level::Info).unwrap();
1719
console_error_panic_hook::set_once();
1820
}
1921

@@ -142,7 +144,10 @@ pub async fn test_async_blob_reader(file: web_sys::File) -> String {
142144
pub async fn clean_file(file: web_sys::File, endpoint: String, jwt_token: String, expiration: u64) -> String {
143145
log::debug!("clean_file called with {file:?}, {endpoint}, {jwt_token}, {expiration}");
144146

147+
let _timer = Timer::new_enforce_report("clean file main");
148+
145149
let filename = file.name();
150+
let filesize = file.size();
146151

147152
let Ok(blob) = file.slice() else {
148153
log::error!("failed to convert a file to blob");
@@ -169,16 +174,19 @@ pub async fn clean_file(file: web_sys::File, endpoint: String, jwt_token: String
169174

170175
let upload_session = Arc::new(FileUploadSession::new(Arc::new(config)));
171176

172-
let mut handle = upload_session.start_clean(filename);
177+
let mut handle = upload_session.start_clean(filename, None);
173178

174179
const READ_BUF_SIZE: usize = 8 * 1024 * 1024;
175180
let mut buf = vec![0u8; READ_BUF_SIZE];
176181
let mut total_read = 0;
182+
let mut last_report = 0.;
177183
loop {
184+
let _timer = Timer::new(format!("read file at {total_read}"));
178185
let Ok(bytes) = reader.read(&mut buf).await else {
179186
log::error!("failed to read from reader");
180187
return "".to_owned();
181188
};
189+
drop(_timer);
182190
if bytes == 0 {
183191
break;
184192
}
@@ -192,7 +200,13 @@ pub async fn clean_file(file: web_sys::File, endpoint: String, jwt_token: String
192200
return "".to_owned();
193201
};
194202

195-
log::debug!("processed {total_read} bytes");
203+
log::debug!("read {total_read} bytes");
204+
205+
let percentage = total_read as f64 / filesize * 100.;
206+
if (percentage - last_report) > 10. {
207+
log::info!("processing {percentage:.2}% of file");
208+
last_report = percentage;
209+
}
196210
}
197211
let Ok((file_hash, sha256, _metrics)) = handle.finish().await else {
198212
log::error!("failed to finish cleaner");

hf_xet_wasm/examples/xet_meta.js

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,17 @@ function xetMetadataOrNone(jsonData) {
2727
}
2828

2929
async function fetchXetMetadataFromRepoInfo({
30-
endpoint,
31-
tokenType,
32-
repoId,
33-
repoType,
34-
headers,
35-
params = null,
36-
}) {
30+
hfEndpoint,
31+
tokenType,
32+
repoId,
33+
repoType,
34+
headers,
35+
params = null
36+
}) {
3737
/**
3838
* Uses the repo info to request a XET access token from Hub.
3939
*
40+
* @param {string} hfEndpoint - The HF Hub endpoint.
4041
* @param {string} tokenType - Type of the token to request: "read" or "write".
4142
* @param {string} repoId - A namespace (user or an organization) and a repo name separated by a `/`.
4243
* @param {string} repoType - Type of the repo to upload to: "model", "dataset", or "space".
@@ -46,7 +47,7 @@ async function fetchXetMetadataFromRepoInfo({
4647
* @throws {Error} If the Hub API returned an error or the response is improperly formatted.
4748
*/
4849

49-
const url = `${endpoint}/api/${repoType}s/${repoId}/xet-${tokenType}-token/main`;
50+
const url = `${hfEndpoint}/api/${repoType}s/${repoId}/xet-${tokenType}-token/main`;
5051
console.log(`${url}`);
5152

5253
return fetchXetMetadataWithUrl(url, headers, params);

hf_xet_wasm/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ mod sha256;
1010
mod wasm_deduplication_interface;
1111
mod wasm_file_cleaner;
1212
pub mod wasm_file_upload_session;
13-
13+
pub mod wasm_timer;
1414
mod xorb_uploader;
1515

1616
pub use session::XetSession;

hf_xet_wasm/src/session.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ impl XetSession {
6767
#[wasm_bindgen(js_name = "uploadFileFromBlob")]
6868
pub async fn upload_file_from_blob(&mut self, tracker_id: String, blob: Blob) -> Result<JsValue, JsValue> {
6969
// read from blob async
70-
let mut cleaner = self.upload.start_clean(tracker_id);
70+
let mut cleaner = self.upload.start_clean(tracker_id, None);
7171

7272
let mut reader = BlobReader::new(blob)?;
7373

hf_xet_wasm/src/sha256.rs

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,44 @@
1-
use std::sync::Arc;
2-
31
use deduplication::Chunk;
42
use merklehash::MerkleHash;
53
use sha2::{Digest, Sha256};
64

75
use super::errors::*;
86

7+
pub enum ShaGeneration {
8+
Value(MerkleHash),
9+
Action(ShaGenerator),
10+
}
11+
12+
impl ShaGeneration {
13+
pub fn new(hash: Option<MerkleHash>) -> Self {
14+
match hash {
15+
Some(h) => Self::Value(h),
16+
None => Self::Action(ShaGenerator::new()),
17+
}
18+
}
19+
20+
pub fn update(&mut self, new_chunks: &[Chunk]) {
21+
match self {
22+
ShaGeneration::Value(_) => {},
23+
ShaGeneration::Action(sha_generator) => sha_generator.update(new_chunks),
24+
}
25+
}
26+
27+
pub fn update_with_bytes(&mut self, new_bytes: &[u8]) {
28+
match self {
29+
ShaGeneration::Value(_) => {},
30+
ShaGeneration::Action(sha_generator) => sha_generator.update_with_bytes(new_bytes),
31+
}
32+
}
33+
34+
pub fn finalize(self) -> Result<MerkleHash> {
35+
match self {
36+
ShaGeneration::Value(hash) => Ok(hash),
37+
ShaGeneration::Action(sha_generator) => sha_generator.finalize(),
38+
}
39+
}
40+
}
41+
942
pub struct ShaGenerator {
1043
hasher: Sha256,
1144
}
@@ -17,13 +50,17 @@ impl ShaGenerator {
1750
}
1851
}
1952

20-
pub async fn update(&mut self, new_chunks: Arc<[Chunk]>) {
53+
pub fn update(&mut self, new_chunks: &[Chunk]) {
2154
for chunk in new_chunks.iter() {
2255
self.hasher.update(&chunk.data);
2356
}
2457
}
2558

26-
pub async fn finalize(self) -> Result<MerkleHash> {
59+
pub fn update_with_bytes(&mut self, new_bytes: &[u8]) {
60+
self.hasher.update(new_bytes);
61+
}
62+
63+
pub fn finalize(self) -> Result<MerkleHash> {
2764
let sha256 = self.hasher.finalize();
2865
let hex_str = format!("{sha256:x}");
2966
Ok(MerkleHash::from_hex(&hex_str)?)

0 commit comments

Comments
 (0)