The problem that is to be solved by this C program is that it should count the number of tags in an HTML file (which is inputted using input redirection) and then display all unique tags along with their counts. There is a restrictive definition of what a tag is considered to be: Assume an HTML tag starts right after '<' and terminates with either a space or '>', also the tag is to be only alphanumeric. For example, <h1>
and <span
are HTML tags while <!--
and </p>
are not. The program further takes input from the user from the command line. If the user inputs -a, then the tags are to be printed in alphabetic order, if -n then they are printed in descending order according to the count and if there is no input then the tags are just printed in the order they are stored with their counts. Other inputs are similar.
The issue that I am facing with my code is that sometimes, the same tag is printed twice such as span: 1 span: 1 when it should be span: 2 instead of separately printing its occurrences. Other tags get printed fine. I cannot find a specific order for why some tags get printed/counted this way. My other issue is that my program is also printing a peculiar tag of the form p or P(with some sort of symbol) which doesn't look right (see faulty output). My last issue is there is some problem with my method that sorts in descending order as it is not sorting correctly and it also causes a segmentation fault when I try to input different HTML files.
Required Output(No sort):
HTML Tags Found:
body: 1
div: 1
p: 2
b: 2
span: 2
Faulty Outputs(no sort and sort):
$ ./countTags < A1.html
HTML Tags Found:
P/3: 307
body: 1
div: 1
p: 2
b: 2
span: 1
span: 1
$ ./countTags -a < A1.html
HTML Tags Found:
Pz▒▒: 509
b: 2
body: 1
div: 1
p: 2
span: 1
span: 1
$ ./countTags -n < A1.html
HTML Tags Found:
P: 713
My Code:
#include "list.h"
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#define MAX_TAG_LEN 10
void insertNode(Node * head_ref, char newData[MAX_TAG_LEN])
{
Node * new_node = (Node *) malloc(sizeof(Node));
// Used in step 5
Node * last = head_ref;
// 2. Put in the data
strcpy(new_node->data, newData); //new_node->data = newData;
new_node->count = 1;
// 3. This new node is going to be the
// last node, so make next of it as NULL
new_node->next = NULL;
// 4. If the Linked List is empty, then make
// the new node as head
if (head_ref == NULL)
{
head_ref->next = new_node;
return;
}
// 5. Else traverse till the last node
while (last->next != NULL)
last = last->next;
// 6. Change the next of last node
last->next = new_node;
return;
}
void sortLinkedListAlphabetically(Node** head) {
Node* sortedList = NULL;
Node* unsortedList = *head;
while (unsortedList != NULL) {
Node* nodeToInsert = unsortedList;
unsortedList = unsortedList->next;
// Find the correct position to insert the node in the sorted list
if (sortedList == NULL || strcmp(nodeToInsert->data, sortedList->data) < 0) {
// Insert the node at the beginning of the sorted list
nodeToInsert->next = sortedList;
sortedList = nodeToInsert;
} else {
// Traverse the sorted list to find the correct position to insert the node
Node* current = sortedList;
while (current->next != NULL && strcmp(nodeToInsert->data, current->next->data) >= 0) {
current = current->next;
}
// Insert the node before the node at the current position
nodeToInsert->next = current->next;
current->next = nodeToInsert;
}
}
// Update the head of the original linked list to point to the head of the sorted list
*head = sortedList;
}
void sortListDescending(Node** head) {
Node* current = *head;
Node* prev = NULL;
Node* next = NULL;
int swapped = 1;
if (*head == NULL) {
return;
}
while (swapped) {
swapped = 0;
prev = NULL;
current = *head;
while (current->next != NULL) {
if (current->data < current->next->data) {
next = current->next;
current->next = next->next;
next->next = current;
if (prev != NULL) {
prev->next = next;
}
prev = next;
swapped = 1;
}
else {
prev = current;
current = current->next;
}
}
}
}
void printLinkedList(Node* head) {
Node* current = head;
while (current != NULL) {
printf("%s: %d\n", current->data, current->count);
current = current->next;
}
printf("\n");
}
void countTags(char input[])
{
Node * head = NULL;
head = (Node *)malloc(sizeof(Node));
int maxTags = 0;
int c;
int within_tag = 0;
char tagName[MAX_TAG_LEN];
int tagNameLen = 0;
while((c = getchar()) != EOF)
{
if(c == '<')
{
within_tag = 1;
tagNameLen = 0;
}
else if(c == '>' || c == ' ')
{
within_tag = 0;
tagName[tagNameLen] = '\0';
if(c == '>')
{ Node *last = head;
int found = 0;
//printf("%s ~", tagName);
//putchar('\n');
while(last->next != NULL)
{ //printf("%s *", last->data);
//putchar('\n');
if(strcmp(last->data, tagName) == 0)
{ //printf("%s - %s*",last->data, tagName);
//putchar('\n');
found = 1;
last->count++;
}
last = last->next;
}
if(found == 0)
{
int k;
int alpha1 = 1;
//printf("%s -", tagName);
//putchar('\n');
for(k=0; k<tagNameLen; k++)
{
if(isalnum(tagName[k]) == 0)
{
alpha1 = 0;
}
}
if(alpha1 == 1)
{
insertNode(head, tagName);
}
}
}
}
else if(within_tag)
{
if(tagNameLen < MAX_TAG_LEN )
{
tagName[tagNameLen] = c;
tagNameLen++;
}
}
}
if((strcmp(input, "") == 0))
{ printf("HTML Tags Found:\n");
printLinkedList(head);
}
else if((strcmp(input, "-a") == 0) || (strcmp(input, "-a-n") == 0) || (strcmp(input, "-n-a") == 0))
{
sortLinkedListAlphabetically(&(head->next));
printf("HTML Tags Found:\n");
printLinkedList(head);
}
else if((strcmp(input, "-n") == 0))
{
sortListDescending(&head);
printf("HTML Tags Found:\n");
printLinkedList(head);
}
}
int main(int argc, char *argv[])
{
char input1[3], input2[3];
if(argc == 2)
{
strcpy(input1, argv[1]);
countTags(input1);
}
else if(argc>2)
{
strcpy(input1, argv[1]);
strcpy(input2, argv[2]);
char* input3;
input3 = malloc(strlen(input1)+1+2);
strcpy(input3, input1);
strcat(input3, input2);
countTags(input3);
}
else
{
countTags("");
}
}
Header file for the struct:
#ifndef LIST_H
#define LIST_H
#define MAX_TAG_LEN 10
typedef struct listNode
{
char data[MAX_TAG_LEN];
int count;
struct listNode * next;
} Node;
#endif
HTML file inputted via input redirection:
<body lang=EN-CA link=blue vlink="#954F72">
<div class=WordSection1>
<p class=MsoNormal><b><span lang=EN-US style='font-size:14.0pt;font-family: "Times New Roman",serif'>Sample</span></b></p>
<p class=MsoNormal><b><span lang=EN-US style='font-size:14.0pt;font-family: "Times New Roman",serif'>Problem 0</span></b></p>