lynx-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

lynx-dev optimization Re: internal links (patch11)


From: Leonid Pauzner
Subject: lynx-dev optimization Re: internal links (patch11)
Date: Tue, 5 Nov 2002 02:09:42 +0300 (MSK)

One more patch that optimizes parsing of large HTML documents with many anchors.

Sorry, I do not know the release schedule - I have sent too many small patches...
They may be picked up together if that is a problem. Looking forward to dev10.


* optimization for parsing HTML with many relative links (href="#fragment") -
 HTAnchor_findChildAndLink() and HTML_start_element(), case HTML_A:
 now avoid significant overhead when link == LINK_INTERNAL (e.g. resolving
 against base, lots of reallocations, parent lookup, etc. are all useless).
 Two functions affected [HTAnchor.c, HTML.c].  The code works both with and
 without the DONT_TRACK_INTERNAL_LINKS symbol.


This patch applies on top of my previous patches;
it also undoes my patch #7 re: HTParseAnchor() in HTParse.[c,h].




diff -u -p -r LYNX2-8-.590/src/html.c LYNX2-8-/src/html.c
--- LYNX2-8-.590/src/html.c     Sun Oct  6 17:43:28 2002
+++ LYNX2-8-/src/html.c Mon Nov  4 22:55:26 2002
@@ -661,7 +661,6 @@ PUBLIC void HTML_write ARGS3(HTStructure
  *  context an internal link makes no sense (e.g., IMG SRC=).
  */

-#ifndef DONT_TRACK_INTERNAL_LINKS
 /* A flag is used to keep track of whether an "URL reference" encountered
    had a real "URL" or not.  In the latter case, it will be marked as
    "internal". The flag is set before we start messing around with the
@@ -674,12 +673,6 @@ PUBLIC void HTML_write ARGS3(HTStructure
    just an abbreviation. - kw */
 #define INTERN_LT (HTLinkType *)(intern_flag ? LINK_INTERNAL : NULL)

-#else  /* !DONT_TRACK_INTERNAL_LINKS */
-
-#define CHECK_FOR_INTERN(flag,s)  /* do nothing */ ;
-#define INTERN_LT (HTLinkType *)NULL
-
-#endif /* DONT_TRACK_INTERNAL_LINKS */

 #ifdef USE_COLOR_STYLE
 # if !OPT_SCN
@@ -3020,78 +3013,80 @@ PRIVATE int HTML_start_element ARGS6(
                   value[HTML_A_NAME] && *value[HTML_A_NAME]) {
            StrAllocCopy(id_string, value[HTML_A_NAME]);
        }
-       if (id_string) {
+       if (id_string)
            TRANSLATE_AND_UNESCAPE_TO_STD(&id_string);
-           if (*id_string == '\0') {
-               FREE(id_string);
-           }
-       }

        /*
         *  Handle the reference. - FM
         */
        if (present && present[HTML_A_HREF]) {
-#ifndef DONT_TRACK_INTERNAL_LINKS
-           if (present[HTML_A_ISMAP])
-               intern_flag = FALSE;
-           else
-               CHECK_FOR_INTERN(intern_flag,value[HTML_A_HREF]);
-#endif
            /*
-            *  Prepare to do housekeeping on the reference. - FM
+            *  Set to know we are making the content bold.
             */
-           if (!value[HTML_A_HREF] || *value[HTML_A_HREF] == '\0') {
-               StrAllocCopy(href, me->node_anchor->address);
-           } else if (*value[HTML_A_HREF] == '#') {
-               StrAllocCopy(href, me->node_anchor->address);
-               if (strlen(value[HTML_A_HREF]) > 1) {
-                   StrAllocCat(href, value[HTML_A_HREF]);
-               }
-           } else {
+           me->inBoldA = TRUE;
+
+           CHECK_FOR_INTERN(intern_flag,value[HTML_A_HREF]);
+           if (present[HTML_A_ISMAP]) /*???*/
+               intern_flag = FALSE;
+
+           if (intern_flag) {
+               /*** FAST WAY: ***/
                StrAllocCopy(href, value[HTML_A_HREF]);
-           }
-           url_type = LYLegitimizeHREF(me, &href, TRUE, TRUE);
+               if (href && *href)
+                   TRANSLATE_AND_UNESCAPE_TO_STD(&href);

-           /*
-            *  Deal with our ftp gateway kludge. - FM
-            */
-           if (!url_type && !strncmp(href, "/foo/..", 7) &&
-               (isFTP_URL(me->node_anchor->address) ||
-                isFILE_URL(me->node_anchor->address))) {
-               for (i = 0; (href[i] = href[i+7]) != 0; i++)
-                   ;
-           }
+           } else {
+               /*
+                *      Prepare to do housekeeping on the reference. - FM
+                */
+               if (!value[HTML_A_HREF] || *value[HTML_A_HREF] == '\0') {
+                   StrAllocCopy(href, me->node_anchor->address);
+               } else if (*value[HTML_A_HREF] == '#') {
+                   StrAllocCopy(href, me->node_anchor->address);
+                   if (strlen(value[HTML_A_HREF]) > 1) {
+                       StrAllocCat(href, value[HTML_A_HREF]);
+                   }
+               } else {
+                   StrAllocCopy(href, value[HTML_A_HREF]);
+               }
+               url_type = LYLegitimizeHREF(me, &href, TRUE, TRUE);

-           /*
-            *  Set to know we are making the content bold.
-            */
-           me->inBoldA = TRUE;
+               /*
+                *      Deal with our ftp gateway kludge. - FM
+                */
+               if (!url_type && !strncmp(href, "/foo/..", 7) &&
+                     (isFTP_URL(me->node_anchor->address) ||
+                      isFILE_URL(me->node_anchor->address))) {
+                   for (i = 0; (href[i] = href[i+7]) != 0; i++)
+                       ;
+               }

-           /*
-            *  Check whether a base tag is in effect. - FM
-            */
-           if ((me->inBASE && *href != '\0' && *href != '#') &&
-               (temp = HTParse(href, me->base_href, PARSE_ALL)) &&
-               *temp != '\0')
                /*
-                *  Use reference related to the base.
+                *      Check whether a base tag is in effect. - FM
                 */
-               StrAllocCopy(href, temp);
-           FREE(temp);
+               if ((me->inBASE && *href != '\0' && *href != '#') &&
+                     (temp = HTParse(href, me->base_href, PARSE_ALL)) &&
+                      *temp != '\0')
+                   /*
+                    *  Use reference related to the base.
+                    */
+                   StrAllocCopy(href, temp);
+               FREE(temp);

-           /*
-            *  Check whether to fill in localhost. - FM
-            */
-           LYFillLocalFileURL(&href,
-                              ((*href != '\0' && *href != '#' &&
-                                me->inBASE) ?
-                              me->base_href : me->node_anchor->address));
+               /*
+                *      Check whether to fill in localhost. - FM
+                */
+               LYFillLocalFileURL(&href,
+                               ((*href != '\0' && *href != '#' &&
+                                  me->inBASE) ?
+                                me->base_href : me->node_anchor->address));
+           }
        } else {
            if (bold_name_anchors == TRUE) {
                me->inBoldA = TRUE;
            }
        }
-#ifndef DONT_TRACK_INTERNAL_LINKS
+
        if (present && present[HTML_A_TYPE] && value[HTML_A_TYPE]) {
            StrAllocCopy(temp, value[HTML_A_TYPE]);
            if (!intern_flag && href &&
@@ -3108,7 +3103,6 @@ PRIVATE int HTML_start_element ARGS6(
                FREE(temp);
            }
        }
-#endif /* DONT_TRACK_INTERNAL_LINKS */

        me->CurrentA = HTAnchor_findChildAndLink(
                        me->node_anchor,                        /* Parent */
diff -u -p -r LYNX2-8-.590/src/lymainlo.c LYNX2-8-/src/lymainlo.c
--- LYNX2-8-.590/src/lymainlo.c Sun Oct  6 17:43:28 2002
+++ LYNX2-8-/src/lymainlo.c     Mon Nov  4 22:55:30 2002
@@ -158,6 +158,7 @@ PRIVATE int str_n_cmp(const char *p, con
 #include <LYexit.h>
 #include <LYLeaks.h>

+PUBLIC HTLinkType * LINK_INTERNAL = 0;

 #ifndef DONT_TRACK_INTERNAL_LINKS
 #define NO_INTERNAL_OR_DIFFERENT(c,n) TRUE
@@ -5233,6 +5234,12 @@ int mainloop NOARGS
     unsigned int len;
     int i;
     int follow_col = -1, key_count = 0, last_key = 0;
+
+/*  "internal" means "within the same document, with certainty".
+    It includes a space so it cannot conflict with any (valid) "TYPE"
+    attributes on A elements. [According to which DTD, anyway??] - kw
+ */
+    LINK_INTERNAL = HTAtom_for("internal link");  /* init */

 /*
  *  curdoc.address contains the name of the file that is currently open.
diff -u -p -r LYNX2-8-.590/www/library/implemen/htanchor.h 
LYNX2-8-/www/library/implemen/htanchor.h
--- LYNX2-8-.590/www/library/implemen/htanchor.h        Fri Oct 18 03:56:38 2002
+++ LYNX2-8-/www/library/implemen/htanchor.h    Mon Nov  4 22:55:04 2002
@@ -131,11 +131,8 @@ typedef struct _DocAddress {
     BOOL   safe;
 } DocAddress;

-/* "internal" means "within the same document, with certainty".
-   It includes a space so it cannot conflict with any (valid) "TYPE"
-   attributes on A elements. [According to which DTD, anyway??] - kw */
-
-#define LINK_INTERNAL HTAtom_for("internal link")
+/* "internal" means "within the same document, with certainty". */
+extern HTLinkType * LINK_INTERNAL;

 /*     Create new or find old sub-anchor
 **     ---------------------------------

diff -u -p -r LYNX2-8-.590/www/library/implemen/htanchor.c 
LYNX2-8-/www/library/implemen/htanchor.c
--- LYNX2-8-.590/www/library/implemen/htanchor.c        Sun Oct  6 17:43:28 2002
+++ LYNX2-8-/www/library/implemen/htanchor.c    Tue Nov  5 01:11:24 2002
@@ -70,34 +70,14 @@ PRIVATE HTParentAnchor * HTParentAnchor_
     HTParentAnchor *newAnchor = typecalloc(HTParentAnchor);
     if (newAnchor == NULL)
        outofmem(__FILE__, "HTParentAnchor_new");
+    /* calloc: all pointers initialized to NULL */
+
     newAnchor->parent = newAnchor;
-    newAnchor->bookmark = NULL;                /* Bookmark filename. - FM */
     newAnchor->isISMAPScript = FALSE;  /* Lynx appends ?0,0 if TRUE. - FM */
     newAnchor->isHEAD = FALSE;         /* HEAD request if TRUE. - FM */
     newAnchor->safe = FALSE;           /* Safe. - FM */
-#ifdef SOURCE_CACHE
-    newAnchor->source_cache_file = NULL;
-    newAnchor->source_cache_chunk = NULL;
-#endif
-    newAnchor->FileCache = NULL;       /* Path to a disk-cached copy. - FM */
-    newAnchor->SugFname = NULL;                /* Suggested filename. - FM */
-    newAnchor->RevTitle = NULL;                /* TITLE for a LINK with REV. - FM */
-    newAnchor->citehost = NULL;                /* LINK REL=citehost - RDC */
-    newAnchor->cache_control = NULL;   /* Cache-Control. - FM */
     newAnchor->no_cache = FALSE;       /* no-cache? - FM */
-    newAnchor->content_type = NULL;    /* Content-Type. - FM */
-    newAnchor->content_language = NULL; /* Content-Language. - FM */
-    newAnchor->content_encoding = NULL; /* Compression algorithm. - FM */
-    newAnchor->content_base = NULL;    /* Content-Base. - FM */
-    newAnchor->content_disposition = NULL; /* Content-Disposition. - FM */
-    newAnchor->content_location = NULL; /* Content-Location. - FM */
-    newAnchor->content_md5 = NULL;     /* Content-MD5. - FM */
     newAnchor->content_length = 0;     /* Content-Length. - FM */
-    newAnchor->date = NULL;            /* Date. - FM */
-    newAnchor->expires = NULL;         /* Expires. - FM */
-    newAnchor->last_modified = NULL;   /* Last-Modified. - FM */
-    newAnchor->ETag = NULL;            /* ETag (HTTP/1.1 cache validator) */
-    newAnchor->server = NULL;          /* Server. - FM */
     return(newAnchor);
 }

@@ -255,6 +235,10 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
 }


+PRIVATE HTParentAnchor * HTAnchor_findAddress_nofragment PARAMS((
+       CONST DocAddress *      newdoc));
+
+
 /*     Create or find a child anchor with a possible link
 **     --------------------------------------------------
 **
@@ -270,30 +254,46 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
 {
     HTChildAnchor * child = HTAnchor_findChild(parent, tag);

-    CTRACE((tfp,"Entered HTAnchor_findChildAndLink\n"));
+    CTRACE((tfp,"Entered HTAnchor_findChildAndLink:  tag=`%s',%s href=`%s'\n",
+               NonNull(tag),
+               (ltype == LINK_INTERNAL) ? " (internal link)" : "",
+               NonNull(href) ));

     if (href && *href) {
-       char *relative_to = HTAnchor_address((HTAnchor *)parent);
+       CONST char *fragment = NULL;
        DocAddress parsed_doc;
-       HTAnchor * dest;
+       HTParentAnchor * dest;

-       parsed_doc.address = HTParse(href, relative_to, PARSE_ALL);
-#ifndef DONT_TRACK_INTERNAL_LINKS
-       if (ltype && parent->post_data && ltype == LINK_INTERNAL) {
-           /* for internal links, find a destination with the same
-              post data if the source of the link has post data. - kw */
-           parsed_doc.post_data = parent->post_data;
-           parsed_doc.post_content_type = parent->post_content_type;
-       } else
-#endif
-       {
+       if (ltype == LINK_INTERNAL) {
+           dest = parent;
+           fragment = href+1;
+       } else {
+           char *relative_to = HTAnchor_address((HTAnchor *)parent);
+           /* hmm, it seems HTML.c always resolve href to absolute url??? */
+           parsed_doc.address = HTParse(href, relative_to,
+               PARSE_ACCESS | PARSE_HOST | PARSE_PATH | PARSE_PUNCTUATION);
            parsed_doc.post_data = NULL;
            parsed_doc.post_content_type = NULL;
+           parsed_doc.bookmark = NULL;
+           parsed_doc.isHEAD = FALSE;
+           parsed_doc.safe = FALSE;
+           dest = HTAnchor_findAddress_nofragment(&parsed_doc);
+           FREE(relative_to);
+           FREE(parsed_doc.address);
+           fragment = HTParse(href, "", PARSE_ANCHOR);
        }
-       parsed_doc.bookmark = NULL;
-       parsed_doc.isHEAD = FALSE;
-       parsed_doc.safe = FALSE;
-       dest = HTAnchor_findAddress(&parsed_doc);
+
+       /*
+       ** [comment from HTAnchor_findAddress()]
+       ** If the address represents a sub-anchor, we load its parent,
+       ** then we create a child anchor within that document.
+       */
+       if (*fragment)
+           dest = (HTParentAnchor *)HTAnchor_findChild(dest, fragment);
+
+
+       if (ltype != LINK_INTERNAL)
+           FREE(fragment);

 #define DUPLICATE_ANCHOR_NAME_WORKAROUND

@@ -307,7 +307,7 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
                CTRACE((tfp,
                       "*** Duplicate ChildAnchor %p named `%s' with %d links",
                       child, tag, child_links));
-               if (dest == testdest1 && ltype == child->mainLink.type) {
+               if ((HTAnchor *)dest == testdest1 && ltype == child->mainLink.type) {
                    CTRACE((tfp,", same dest %p and type, keeping it\n",
                           testdest1));
                } else {
@@ -318,13 +318,12 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
            }
        }
 #endif
-       HTAnchor_link((HTAnchor *)child, dest, ltype);
-       FREE(parsed_doc.address);
-       FREE(relative_to);
+       HTAnchor_link((HTAnchor *)child, (HTAnchor *)dest, ltype);
     }
     return(child);
 }

+
 #ifdef LY_FIND_LEAKS
 /*
 **  Function for freeing the adult hash table. - FM
@@ -376,7 +375,7 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
        CONST DocAddress *,     newdoc)
 {
     /* Anchor tag specified ? */
-    char *tag = HTParseAnchor(newdoc->address);
+    char *tag = HTParse(newdoc->address, "", PARSE_ANCHOR);

     CTRACE((tfp,"Entered HTAnchor_findAddress\n"));

@@ -384,7 +383,7 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
     ** If the address represents a sub-anchor, we recursively load its
     ** parent, then we create a child anchor within that document.
     */
-    if (tag && *tag) {
+    if (*tag) {
        DocAddress parsed_doc;
        HTParentAnchor * foundParent;
        HTChildAnchor * foundAnchor;
@@ -397,14 +396,22 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
        parsed_doc.isHEAD = newdoc->isHEAD;
        parsed_doc.safe = newdoc->safe;

-       foundParent = (HTParentAnchor *)HTAnchor_findAddress(&parsed_doc);
+       foundParent = HTAnchor_findAddress_nofragment(&parsed_doc);
        foundAnchor = HTAnchor_findChild (foundParent, tag);
        FREE(parsed_doc.address);
        FREE(tag);
        return (HTAnchor *)foundAnchor;
-    } else {
+    }
+    FREE(tag);
+    return (HTAnchor *)HTAnchor_findAddress_nofragment(newdoc);
+}
+
+/*  The address has no anchor tag for sure.
+ */
+PRIVATE HTParentAnchor * HTAnchor_findAddress_nofragment ARGS1(
+       CONST DocAddress *,     newdoc)
+{
        /*
-       **  If the address has no anchor tag,
        **  check whether we have this node.
        */
        int hash;
@@ -412,8 +419,6 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
        HTList *grownups;
        HTParentAnchor * foundAnchor;

-       FREE(tag);
-
        /*
        **  Select list from hash table,
        */
@@ -448,7 +453,7 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
            {
                CTRACE((tfp, "Anchor %p with address `%s' already exists.\n",
                            (void *)foundAnchor, newdoc->address));
-                return (HTAnchor *)foundAnchor;
+                return foundAnchor;
             }
        }

@@ -469,13 +474,14 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
        foundAnchor->isHEAD = newdoc->isHEAD;
        foundAnchor->safe = newdoc->safe;
        HTList_addObject (adults, foundAnchor);
-       return (HTAnchor *)foundAnchor;
-    }
+
+       return foundAnchor;
 }
+
 /*     Create new or find old named anchor - simple form
 **     -------------------------------------------------
 **
-**     Like the previous one, but simpler to use for simple cases.
+**     Like HTAnchor_findAddress, but simpler to use for simple cases.
 **     No post data etc. can be supplied. - kw
 */
 PUBLIC HTAnchor * HTAnchor_findSimpleAddress ARGS1(
diff -u -p -r LYNX2-8-.590/www/library/implemen/htparse.c 
LYNX2-8-/www/library/implemen/htparse.c
--- LYNX2-8-.590/www/library/implemen/htparse.c Sat Nov  2 22:11:02 2002
+++ LYNX2-8-/www/library/implemen/htparse.c     Mon Nov  4 22:55:08 2002
@@ -485,29 +485,6 @@ PUBLIC char * HTParse ARGS3(
 }


-PUBLIC char * HTParseAnchor ARGS1(
-       CONST char *,   aName)
-{
-    if (!aName)
-       return 0;
-
-    if (!strncasecomp(aName, "http://", 7) ||
-               !strncasecomp(aName, "file://", 7) ||
-               !strncasecomp(aName, "https://", 8)) { /* fast way */
-       CONST char * p;
-       for (p = aName; *p && *p != '#'; p++)
-           ;
-       if (*p++) {
-           char * res = 0;
-           StrAllocCopy(res, p);
-           return res;
-       }
-       return 0;
-    }
-    return HTParse(aName, "", PARSE_ANCHOR);  /* may have unescaped hashes */
-}
-
-
 /*     Simplify a filename.                            HTSimplify()
 **     --------------------
 **
diff -u -p -r LYNX2-8-.590/www/library/implemen/htparse.h 
LYNX2-8-/www/library/implemen/htparse.h
--- LYNX2-8-.590/www/library/implemen/htparse.h Mon Nov  4 21:15:42 2002
+++ LYNX2-8-/www/library/implemen/htparse.h     Mon Nov  4 22:55:04 2002
@@ -70,10 +70,6 @@ extern char * HTParse PARAMS((
        CONST char *    relatedName,
        int             wanted));

-extern char * HTParseAnchor PARAMS((  /* faster then HTParse() */
-       CONST char *    aName));
-
-
 /*     Simplify a filename.                            HTSimplify()
 **     --------------------
 **


; To UNSUBSCRIBE: Send "unsubscribe lynx-dev" to address@hidden

reply via email to

[Prev in Thread] Current Thread [Next in Thread]