[gpt]
saved_dir = out
in_file = hf-internal-testing/tiny-random-gptj
trained_gpu_num = 1
infer_gpu_num = 1
processes = 4
weight_data_type = fp32
vocab_size = 1000
n_positions = 512
n_embd = 32
n_layer = 5
n_head = 4
n_inner = None
rotary_dim = 4
activation_function = gelu_new
resid_pdrop = 0.0
embd_pdrop = 0.0
attn_pdrop = 0.0
layer_norm_epsilon = 1e-05
initializer_range = 0.02
use_cache = True
bos_token_id = 98
eos_token_id = 98
return_dict = True
output_hidden_states = False
output_attentions = False
torchscript = False
torch_dtype = None
use_bfloat16 = False
tf_legacy_loss = False
pruned_heads = {}
tie_word_embeddings = False
is_encoder_decoder = False
is_decoder = False
cross_attention_hidden_size = None
add_cross_attention = False
tie_encoder_decoder = False
max_length = 20
min_length = 0
do_sample = False
early_stopping = False
num_beams = 1
num_beam_groups = 1
diversity_penalty = 0.0
temperature = 1.0
top_k = 50
top_p = 1.0
typical_p = 1.0
repetition_penalty = 1.0
length_penalty = 1.0
no_repeat_ngram_size = 0
encoder_no_repeat_ngram_size = 0
bad_words_ids = None
num_return_sequences = 1
chunk_size_feed_forward = 0
output_scores = False
return_dict_in_generate = False
forced_bos_token_id = None
forced_eos_token_id = None
remove_invalid_values = False
exponential_decay_length_penalty = None
suppress_tokens = None
begin_suppress_tokens = None
architectures = None
finetuning_task = None
id2label = {0: 'LABEL_0', 1: 'LABEL_1'}
label2id = {'LABEL_0': 0, 'LABEL_1': 1}
tokenizer_class = None
prefix = None
pad_token_id = 98
sep_token_id = None
decoder_start_token_id = None
task_specific_params = None
problem_type = None
_name_or_path = hf-internal-testing/tiny-random-gptj
_commit_hash = b96595a4bcdeb272096214589efa0314259853a0
transformers_version = 4.11.0.dev0
attention_probs_dropout_prob = 0.0
gradient_checkpointing = False
hidden_act = gelu
hidden_dropout_prob = 0.0
intermediate_size = 37
model_type = gptj
n_ctx = 512
scale_attn_weights = True
type_vocab_size = 16
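For reference, a minimal sketch of reading these values back with Python's standard-library configparser, assuming the dump above has been saved to a file named config.ini (the file name and the subset of keys read are illustrative; configparser returns every value as a string, so numeric fields need explicit conversion):

import configparser

# Parse the INI-style [gpt] section above (assumed saved as config.ini).
parser = configparser.ConfigParser()
parser.read("config.ini")
gpt = parser["gpt"]

# Convert the numeric architecture fields; the expected values from the
# dump above are shown in the trailing comments.
n_embd = gpt.getint("n_embd")            # 32
n_layer = gpt.getint("n_layer")          # 5
n_head = gpt.getint("n_head")            # 4
rotary_dim = gpt.getint("rotary_dim")    # 4
vocab_size = gpt.getint("vocab_size")    # 1000
weight_data_type = gpt["weight_data_type"]  # "fp32"

print(n_embd, n_layer, n_head, rotary_dim, vocab_size, weight_data_type)

Since the values originate from the Hugging Face checkpoint named in in_file, the same architecture fields could equally be fetched with transformers.AutoConfig.from_pretrained("hf-internal-testing/tiny-random-gptj") rather than from the saved INI file.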